diff --git a/src/arch/amdgpu/vega/SConscript b/src/arch/amdgpu/vega/SConscript index 019ef279b3..b7a28a8d6c 100644 --- a/src/arch/amdgpu/vega/SConscript +++ b/src/arch/amdgpu/vega/SConscript @@ -49,13 +49,32 @@ Source('tlb_coalescer.cc') DebugFlag('GPUPTWalker', 'Debug flag for GPU page table walker') if env['CONF']['TARGET_GPU_ISA'] == 'vega': - Source('decoder.cc') + Source('gpu_decoder.cc') Source('insts/gpu_static_inst.cc') Source('insts/instructions.cc') Source('insts/op_encodings.cc') - Source('isa.cc') - Source('registers.cc') + Source('gpu_isa.cc') + Source('gpu_registers.cc') + Source('insts/sop2.cc') + Source('insts/sopk.cc') + Source('insts/sop1.cc') + Source('insts/sopc.cc') + Source('insts/sopp.cc') + Source('insts/smem.cc') + Source('insts/vop2.cc') + Source('insts/vop1.cc') + Source('insts/vopc.cc') + Source('insts/vinterp.cc') + Source('insts/vop3.cc') + Source('insts/vop3_cmp.cc') + Source('insts/ds.cc') + Source('insts/mubuf.cc') + Source('insts/mtbuf.cc') + Source('insts/mimg.cc') + Source('insts/exp.cc') + Source('insts/flat.cc') Source('insts/vop3p.cc') + Source('insts/vop3p_mai.cc') DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA') diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc similarity index 99% rename from src/arch/amdgpu/vega/decoder.cc rename to src/arch/amdgpu/vega/gpu_decoder.cc index 5e2402a4af..940840719b 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -29,9 +29,10 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "arch/amdgpu/vega/gpu_decoder.hh" + #include -#include "arch/amdgpu/vega/gpu_decoder.hh" #include "arch/amdgpu/vega/insts/gpu_static_inst.hh" #include "arch/amdgpu/vega/insts/instructions.hh" #include "arch/amdgpu/vega/insts/vop3p.hh" diff --git a/src/arch/amdgpu/vega/isa.cc b/src/arch/amdgpu/vega/gpu_isa.cc similarity index 100% rename from src/arch/amdgpu/vega/isa.cc rename to src/arch/amdgpu/vega/gpu_isa.cc diff --git a/src/arch/amdgpu/vega/registers.cc b/src/arch/amdgpu/vega/gpu_registers.cc similarity index 100% rename from src/arch/amdgpu/vega/registers.cc rename to src/arch/amdgpu/vega/gpu_registers.cc diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc new file mode 100644 index 0000000000..17acdaa287 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/ds.cc @@ -0,0 +1,4657 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/amdgpu/vega/insts/instructions.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+    // --- Inst_DS__DS_ADD_U32 class methods ---
+
+    Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_add_u32")
+    {
+        setFlag(MemoryRef);
+        setFlag(GroupSegment);
+        setFlag(AtomicAdd);
+        setFlag(AtomicNoReturn);
+    } // Inst_DS__DS_ADD_U32
+
+    Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32()
+    {
+    } // ~Inst_DS__DS_ADD_U32
+
+    // --- description from .arch file ---
+    // 32b:
+    // MEM[ADDR] += DATA;
+    void
+    Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_ADD_U32::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initAtomicAccess<VecElemU32>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_ADD_U32::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_DS__DS_SUB_U32 class methods ---
+
+    Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_sub_u32")
+    {
+    } // Inst_DS__DS_SUB_U32
+
+    Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32()
+    {
+    } // ~Inst_DS__DS_SUB_U32
+
+    // --- description from .arch file ---
+    // 32b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] -= DATA;
+    // RETURN_DATA = tmp.
+    void
+    Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_DS__DS_RSUB_U32 class methods ---
+
+    Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_rsub_u32")
+    {
+    } // Inst_DS__DS_RSUB_U32
+
+    Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32()
+    {
+    } // ~Inst_DS__DS_RSUB_U32
+
+    // --- description from .arch file ---
+    // 32b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] = DATA - MEM[ADDR];
+    // RETURN_DATA = tmp.
+    // Subtraction with reversed operands.
+ void + Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_U32 class methods --- + + Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_u32") + { + } // Inst_DS__DS_INC_U32 + + Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32() + { + } // ~Inst_DS__DS_INC_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_U32 class methods --- + + Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_u32") + { + } // Inst_DS__DS_DEC_U32 + + Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32() + { + } // ~Inst_DS__DS_DEC_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_I32 class methods --- + + Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_i32") + { + } // Inst_DS__DS_MIN_I32 + + Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32() + { + } // ~Inst_DS__DS_MIN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_I32 class methods --- + + Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_i32") + { + } // Inst_DS__DS_MAX_I32 + + Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32() + { + } // ~Inst_DS__DS_MAX_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_U32 class methods --- + + Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_u32") + { + } // Inst_DS__DS_MIN_U32 + + Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32() + { + } // ~Inst_DS__DS_MIN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_U32 class methods --- + + Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_u32") + { + } // Inst_DS__DS_MAX_U32 + + Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32() + { + } // ~Inst_DS__DS_MAX_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_B32 class methods --- + + Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_b32") + { + } // Inst_DS__DS_AND_B32 + + Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32() + { + } // ~Inst_DS__DS_AND_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_B32 class methods --- + + Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_b32") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicOr); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_OR_B32 + + Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32() + { + } // ~Inst_DS__DS_OR_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] |= DATA; + void + Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + + // --- Inst_DS__DS_XOR_B32 class methods --- + + Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_b32") + { + } // Inst_DS__DS_XOR_B32 + + Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32() + { + } // ~Inst_DS__DS_XOR_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_B32 class methods --- + + Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_b32") + { + } // Inst_DS__DS_MSKOR_B32 + + Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32() + { + } // ~Inst_DS__DS_MSKOR_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. 
+ void + Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_B32 class methods --- + + Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B32 + + Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32() + { + } // ~Inst_DS__DS_WRITE_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] = DATA. + // Write dword. + void + Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE2_B32 class methods --- + + Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2_B32 + + Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32() + { + } // ~Inst_DS__DS_WRITE2_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR_BASE + OFFSET0 * 4] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2. + // Write 2 dwords. 
+    void
+    Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
+        ConstVecOperandU32 data1(gpuDynInst, extData.DATA1);
+
+        addr.read();
+        data0.read();
+        data1.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2]
+                    = data0[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 2 + 1] = data1[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0 * 4;
+        Addr offset1 = instData.OFFSET1 * 4;
+
+        initDualMemWrite<VecElemU32>(gpuDynInst, offset0, offset1);
+    }
+
+    void
+    Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    }
+    // --- Inst_DS__DS_WRITE2ST64_B32 class methods ---
+
+    Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_write2st64_b32")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+    } // Inst_DS__DS_WRITE2ST64_B32
+
+    Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32()
+    {
+    } // ~Inst_DS__DS_WRITE2ST64_B32
+
+    // --- description from .arch file ---
+    // 32b:
+    // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA;
+    // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2;
+    // Write 2 dwords.
+    void
+    Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
+        ConstVecOperandU32 data1(gpuDynInst, extData.DATA1);
+
+        addr.read();
+        data0.read();
+        data1.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2]
+                    = data0[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 2 + 1] = data1[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0 * 4 * 64;
+        Addr offset1 = instData.OFFSET1 * 4 * 64;
+
+        initDualMemWrite<VecElemU32>(gpuDynInst, offset0, offset1);
+    }
+
+    void
+    Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    }
+    // --- Inst_DS__DS_CMPST_B32 class methods ---
+
+    Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_cmpst_b32")
+    {
+    } // Inst_DS__DS_CMPST_B32
+
+    Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32()
+    {
+    } // ~Inst_DS__DS_CMPST_B32
+
+    // --- description from .arch file ---
+    // 32b:
+    // tmp = MEM[ADDR];
+    // src = DATA2;
+    // cmp = DATA;
+    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
+    // RETURN_DATA[0] = tmp.
+    // Compare and store.
+ // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP opcode. + void + Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_F32 class methods --- + + Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_f32") + { + setFlag(F32); + } // Inst_DS__DS_CMPST_F32 + + Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32() + { + } // ~Inst_DS__DS_CMPST_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP opcode. + void + Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_F32 class methods --- + + Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_F32 + + Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32() + { + } // ~Inst_DS__DS_MIN_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN. + void + Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_F32 class methods --- + + Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_F32 + + Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32() + { + } // ~Inst_DS__DS_MAX_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. + // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX. + void + Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_NOP class methods --- + + Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_nop") + { + setFlag(Nop); + } // Inst_DS__DS_NOP + + Inst_DS__DS_NOP::~Inst_DS__DS_NOP() + { + } // ~Inst_DS__DS_NOP + + // --- description from .arch file --- + // Do nothing. + void + Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst) + { + gpuDynInst->wavefront()->decLGKMInstsIssued(); + } // execute + // --- Inst_DS__DS_ADD_F32 class methods --- + + Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_f32") + { + setFlag(F32); + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_F32 + + Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32() + { + } // ~Inst_DS__DS_ADD_F32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] += DATA; + // Floating point add that handles NaN/INF/denormal values. 
+    void
+    Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandF32 data(gpuDynInst, extData.DATA0);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemF32*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initAtomicAccess<VecElemF32>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_DS__DS_WRITE_B8 class methods ---
+
+    Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_write_b8")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+    } // Inst_DS__DS_WRITE_B8
+
+    Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8()
+    {
+    } // ~Inst_DS__DS_WRITE_B8
+
+    // --- description from .arch file ---
+    // MEM[ADDR] = DATA[7:0].
+    // Byte write.
+    void
+    Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU8 data(gpuDynInst, extData.DATA0);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initMemWrite<VecElemU8>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_DS__DS_WRITE_B8_D16_HI class methods ---
+
+    Inst_DS__DS_WRITE_B8_D16_HI::Inst_DS__DS_WRITE_B8_D16_HI(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_write_b8_d16_hi")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+    } // Inst_DS__DS_WRITE_B8_D16_HI
+
+    Inst_DS__DS_WRITE_B8_D16_HI::~Inst_DS__DS_WRITE_B8_D16_HI()
+    {
+    } // ~Inst_DS__DS_WRITE_B8_D16_HI
+
+    // --- description from .arch file ---
+    // MEM[ADDR] = DATA[23:16].
+    // Byte write into high word.
+ void + Inst_DS__DS_WRITE_B8_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU8 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = bits(data[lane], 23, 16); + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B8_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B8_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B16 class methods --- + + Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b16") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B16 + + Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16() + { + } // ~Inst_DS__DS_WRITE_B16 + + // --- description from .arch file --- + // MEM[ADDR] = DATA[15:0] + // Short write. + void + Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU16 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_ADD_RTN_U32 class methods --- + + Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_u32") + { + } // Inst_DS__DS_ADD_RTN_U32 + + Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32() + { + } // ~Inst_DS__DS_ADD_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_RTN_U32 class methods --- + + Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_rtn_u32") + { + } // Inst_DS__DS_SUB_RTN_U32 + + Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32() + { + } // ~Inst_DS__DS_SUB_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_RTN_U32 class methods --- + + Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_rtn_u32") + { + } // Inst_DS__DS_RSUB_RTN_U32 + + Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32() + { + } // ~Inst_DS__DS_RSUB_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. + void + Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_RTN_U32 class methods --- + + Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_rtn_u32") + { + } // Inst_DS__DS_INC_RTN_U32 + + Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32() + { + } // ~Inst_DS__DS_INC_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_RTN_U32 class methods --- + + Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_rtn_u32") + { + } // Inst_DS__DS_DEC_RTN_U32 + + Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32() + { + } // ~Inst_DS__DS_DEC_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_I32 class methods --- + + Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_i32") + { + } // Inst_DS__DS_MIN_RTN_I32 + + Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32() + { + } // ~Inst_DS__DS_MIN_RTN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_I32 class methods --- + + Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_i32") + { + } // Inst_DS__DS_MAX_RTN_I32 + + Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32() + { + } // ~Inst_DS__DS_MAX_RTN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_U32 class methods --- + + Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_u32") + { + } // Inst_DS__DS_MIN_RTN_U32 + + Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32() + { + } // ~Inst_DS__DS_MIN_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_U32 class methods --- + + Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_u32") + { + } // Inst_DS__DS_MAX_RTN_U32 + + Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32() + { + } // ~Inst_DS__DS_MAX_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_RTN_B32 class methods --- + + Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_rtn_b32") + { + } // Inst_DS__DS_AND_RTN_B32 + + Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32() + { + } // ~Inst_DS__DS_AND_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_RTN_B32 class methods --- + + Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_rtn_b32") + { + } // Inst_DS__DS_OR_RTN_B32 + + Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32() + { + } // ~Inst_DS__DS_OR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_RTN_B32 class methods --- + + Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_rtn_b32") + { + } // Inst_DS__DS_XOR_RTN_B32 + + Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32() + { + } // ~Inst_DS__DS_XOR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_RTN_B32 class methods --- + + Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_rtn_b32") + { + } // Inst_DS__DS_MSKOR_RTN_B32 + + Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32() + { + } // ~Inst_DS__DS_MSKOR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. 
+ void + Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg_rtn_b32") + { + } // Inst_DS__DS_WRXCHG_RTN_B32 + + Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG_RTN_B32 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + // Write-exchange operation. + void + Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32") + { + } // Inst_DS__DS_WRXCHG2_RTN_B32 + + Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG2_RTN_B32 + + // --- description from .arch file --- + // Write-exchange 2 separate dwords. + void + Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2ST64_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32") + { + } // Inst_DS__DS_WRXCHG2ST64_RTN_B32 + + Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32 + + // --- description from .arch file --- + // Write-exchange 2 separate dwords with a stride of 64 dwords. + void + Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_B32 class methods --- + + Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_b32") + { + } // Inst_DS__DS_CMPST_RTN_B32 + + Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32() + { + } // ~Inst_DS__DS_CMPST_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP opcode. + void + Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_F32 class methods --- + + Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_CMPST_RTN_F32 + + Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32() + { + } // ~Inst_DS__DS_CMPST_RTN_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP opcode. 
+ void + Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_F32 class methods --- + + Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_RTN_F32 + + Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32() + { + } // ~Inst_DS__DS_MIN_RTN_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN. + void + Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_F32 class methods --- + + Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_RTN_F32 + + Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32() + { + } // ~Inst_DS__DS_MAX_RTN_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. + // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX. + void + Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRAP_RTN_B32 class methods --- + + Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrap_rtn_b32") + { + } // Inst_DS__DS_WRAP_RTN_B32 + + Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32() + { + } // ~Inst_DS__DS_WRAP_RTN_B32 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2; + // RETURN_DATA = tmp. + void + Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_RTN_F32 class methods --- + + Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_ADD_RTN_F32 + + Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32() + { + } // ~Inst_DS__DS_ADD_RTN_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + // Floating point add that handles NaN/INF/denormal values. + void + Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_B32 class methods --- + + Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b32") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B32 + + Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32() + { + } // ~Inst_DS__DS_READ_B32 + + // --- description from .arch file --- + // RETURN_DATA = MEM[ADDR]. + // Dword read. 
+    void
+    Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+
+        addr.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initMemRead<VecElemU32>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane];
+            }
+        }
+
+        vdst.write();
+    } // completeAcc
+    // --- Inst_DS__DS_READ2_B32 class methods ---
+
+    Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_read2_b32")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+    } // Inst_DS__DS_READ2_B32
+
+    Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32()
+    {
+    } // ~Inst_DS__DS_READ2_B32
+
+    // --- description from .arch file ---
+    // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4];
+    // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4].
+    // Read 2 dwords.
+    void
+    Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+
+        addr.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0 * 4;
+        Addr offset1 = instData.OFFSET1 * 4;
+
+        initDualMemRead<VecElemU32>(gpuDynInst, offset0, offset1);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 2];
+                vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 2 + 1];
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
+    } // completeAcc
+    // --- Inst_DS__DS_READ2ST64_B32 class methods ---
+
+    Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_read2st64_b32")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+    } // Inst_DS__DS_READ2ST64_B32
+
+    Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32()
+    {
+    } // ~Inst_DS__DS_READ2ST64_B32
+
+    // --- description from .arch file ---
+    // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64];
+    // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64].
+    // Read 2 dwords.
+ void + Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = (instData.OFFSET0 * 4 * 64); + Addr offset1 = (instData.OFFSET1 * 4 * 64); + + initDualMemRead(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } + // --- Inst_DS__DS_READ_I8 class methods --- + + Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_i8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_I8 + + Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() + { + } // ~Inst_DS__DS_READ_I8 + + // --- description from .arch file --- + // RETURN_DATA = signext(MEM[ADDR][7:0]). + // Signed byte read. + void + Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_I8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_I8::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)sext<8>((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_U8 class methods --- + + Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U8 + + Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() + { + } // ~Inst_DS__DS_READ_U8 + + // --- description from .arch file --- + // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. + // Unsigned byte read. 
+ void + Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)(reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_I16 class methods --- + + Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_i16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_I16 + + Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() + { + } // ~Inst_DS__DS_READ_I16 + + // --- description from .arch file --- + // RETURN_DATA = signext(MEM[ADDR][15:0]). + // Signed short read. + void + Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_U16 class methods --- + + Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16 + + Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() + { + } // ~Inst_DS__DS_READ_U16 + + // --- description from .arch file --- + // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. + // Unsigned short read. 
+ void + Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)(reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_SWIZZLE_B32 class methods --- + + Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_swizzle_b32") + { + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_SWIZZLE_B32 + + Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() + { + } // ~Inst_DS__DS_SWIZZLE_B32 + + // --- description from .arch file --- + // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). + // Dword swizzle, no data is written to LDS memory; See ds_opcodes.docx for + // --- details. + void + Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + /** + * The "DS pattern" is comprised of both offset fields. That is, the + * swizzle pattern between lanes. Bit 15 of the DS pattern dictates + * which swizzle mode to use. There are two different swizzle + * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use + * QDMode else use Bit-masks mode. The remaining bits dictate how to + * swizzle the lanes. + * + * QDMode: Chunks the lanes into 4s and swizzles among them. + * Bits 7:6 dictate where lane 3 (of the current chunk) + * gets its date, 5:4 lane 2, etc. + * + * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks. + * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 + * is the and_mask. Each lane is swizzled by performing + * the appropriate operation using these masks. + */ + VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); + + data.read(); + + if (bits(ds_pattern, 15)) { + // QDMode + for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { + /** + * This operation allows data sharing between groups + * of four consecutive threads. Note the increment by + * 4 in the for loop. 
+ */ + if (gpuDynInst->exec_mask[lane]) { + int index0 = lane + bits(ds_pattern, 1, 0); + panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index0); + vdst[lane] + = gpuDynInst->exec_mask[index0] ? data[index0]: 0; + } + if (gpuDynInst->exec_mask[lane + 1]) { + int index1 = lane + bits(ds_pattern, 3, 2); + panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index1); + vdst[lane + 1] + = gpuDynInst->exec_mask[index1] ? data[index1]: 0; + } + if (gpuDynInst->exec_mask[lane + 2]) { + int index2 = lane + bits(ds_pattern, 5, 4); + panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index2); + vdst[lane + 2] + = gpuDynInst->exec_mask[index2] ? data[index2]: 0; + } + if (gpuDynInst->exec_mask[lane + 3]) { + int index3 = lane + bits(ds_pattern, 7, 6); + panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index3); + vdst[lane + 3] + = gpuDynInst->exec_mask[index3] ? data[index3]: 0; + } + } + } else { + // Bit Mode + int and_mask = bits(ds_pattern, 4, 0); + int or_mask = bits(ds_pattern, 9, 5); + int xor_mask = bits(ds_pattern, 14, 10); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + int index = (((lane & and_mask) | or_mask) ^ xor_mask); + // Adjust for the next 32 lanes. + if (lane > 31) { + index += 32; + } + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is " + "out of bounds.\n", gpuDynInst->disassemble(), + index); + vdst[lane] + = gpuDynInst->exec_mask[index] ? data[index] : 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + // --- Inst_DS__DS_PERMUTE_B32 class methods --- + + Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_permute_b32") + { + setFlag(MemoryRef); + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_PERMUTE_B32 + + Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() + { + } // ~Inst_DS__DS_PERMUTE_B32 + + // --- description from .arch file --- + // Forward permute. + void + Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + addr.read(); + data.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + /** + * One of the offset fields can be used for the index. 
+ * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ + assert(!instData.OFFSET1); + /** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. + */ + int index = bits(addr[lane] + instData.OFFSET0, 7, 2); + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); + /** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. + */ + if (wf->execMask(index)) { + vdst[index] = data[lane]; + } else { + vdst[index] = 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + // --- Inst_DS__DS_BPERMUTE_B32 class methods --- + + Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_bpermute_b32") + { + setFlag(MemoryRef); + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_BPERMUTE_B32 + + Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() + { + } // ~Inst_DS__DS_BPERMUTE_B32 + + // --- description from .arch file --- + // Backward permute. + void + Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + addr.read(); + data.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + /** + * One of the offset fields can be used for the index. + * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ + assert(!instData.OFFSET1); + /** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. + */ + int index = bits(addr[lane] + instData.OFFSET0, 7, 2); + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); + /** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. 
+ */ + if (wf->execMask(index)) { + vdst[lane] = data[index]; + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + + // --- Inst_DS__DS_ADD_U64 class methods --- + + Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_u64") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_U64 + + Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() + { + } // ~Inst_DS__DS_ADD_U64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR] += DATA[0:1]; + void + Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_ADD_U64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_ADD_U64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_SUB_U64 class methods --- + + Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_u64") + { + } // Inst_DS__DS_SUB_U64 + + Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64() + { + } // ~Inst_DS__DS_SUB_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_U64 class methods --- + + Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_u64") + { + } // Inst_DS__DS_RSUB_U64 + + Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64() + { + } // ~Inst_DS__DS_RSUB_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. 
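The initiateAcc() above rebuilds the DS immediate from the two 8-bit fields of the instruction word: OFFSET1 supplies the high byte and OFFSET0 the low byte, giving a single 16-bit offset that is applied on top of the per-lane address computed by calcAddr(). A minimal standalone sketch of that packing, with made-up field values:

#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical immediate fields as they would appear in InFmt_DS.
    uint8_t offset0 = 0x34;   // low byte
    uint8_t offset1 = 0x12;   // high byte

    // Same packing as the atomic-add initiateAcc above: the two bytes
    // form one 16-bit offset added to each lane's LDS address.
    uint64_t offset = (static_cast<uint64_t>(offset1) << 8) | offset0;

    std::printf("combined DS offset = 0x%04lx\n",
                static_cast<unsigned long>(offset));
    return 0;
}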
+ void + Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_U64 class methods --- + + Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_u64") + { + } // Inst_DS__DS_INC_U64 + + Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64() + { + } // ~Inst_DS__DS_INC_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_U64 class methods --- + + Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_u64") + { + } // Inst_DS__DS_DEC_U64 + + Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64() + { + } // ~Inst_DS__DS_DEC_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_I64 class methods --- + + Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_i64") + { + } // Inst_DS__DS_MIN_I64 + + Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64() + { + } // ~Inst_DS__DS_MIN_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_I64 class methods --- + + Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_i64") + { + } // Inst_DS__DS_MAX_I64 + + Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64() + { + } // ~Inst_DS__DS_MAX_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_U64 class methods --- + + Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_u64") + { + } // Inst_DS__DS_MIN_U64 + + Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64() + { + } // ~Inst_DS__DS_MIN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_U64 class methods --- + + Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_u64") + { + } // Inst_DS__DS_MAX_U64 + + Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64() + { + } // ~Inst_DS__DS_MAX_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
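The .arch pseudocode quoted for ds_inc_u64 and ds_dec_u64 wraps rather than saturates: the increment resets to zero once the stored value reaches DATA, and the decrement reloads DATA when the stored value is zero or has overshot it. A small host-side transcription of that pseudocode (not gem5 code; the opcodes above are still panicUnimplemented stubs):

#include <cstdint>
#include <cassert>

// MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1; returns tmp.
uint64_t dsIncU64(uint64_t &mem, uint64_t data)
{
    uint64_t tmp = mem;
    mem = (tmp >= data) ? 0 : tmp + 1;
    return tmp;
}

// MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1; returns tmp.
uint64_t dsDecU64(uint64_t &mem, uint64_t data)
{
    uint64_t tmp = mem;
    mem = (tmp == 0 || tmp > data) ? data : tmp - 1;
    return tmp;
}

int main()
{
    uint64_t cell = 3;
    dsIncU64(cell, 4);      // 3 -> 4
    dsIncU64(cell, 4);      // 4 >= 4, wraps to 0
    assert(cell == 0);
    dsDecU64(cell, 4);      // 0 reloads DATA
    assert(cell == 4);
    return 0;
}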
+ void + Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_B64 class methods --- + + Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_b64") + { + } // Inst_DS__DS_AND_B64 + + Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64() + { + } // ~Inst_DS__DS_AND_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_B64 class methods --- + + Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_b64") + { + } // Inst_DS__DS_OR_B64 + + Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64() + { + } // ~Inst_DS__DS_OR_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_B64 class methods --- + + Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_b64") + { + } // Inst_DS__DS_XOR_B64 + + Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64() + { + } // ~Inst_DS__DS_XOR_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_B64 class methods --- + + Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_b64") + { + } // Inst_DS__DS_MSKOR_B64 + + Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64() + { + } // ~Inst_DS__DS_MSKOR_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. + void + Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_B64 class methods --- + + Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B64 + + Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64() + { + } // ~Inst_DS__DS_WRITE_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR] = DATA. + // Write qword. 
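ds_mskor_b64, described above, is the one read-modify-write in this group that consumes two source operands: DATA selects which bits may change and DATA2 supplies their new values. A worked bitwise example of the documented update rule (host-side only; the opcode above is unimplemented):

#include <cstdint>
#include <cassert>

// MEM[ADDR] = (MEM[ADDR] & ~DATA) | DATA2; returns the old value.
uint64_t dsMskorB64(uint64_t &mem, uint64_t mask, uint64_t value)
{
    uint64_t tmp = mem;
    mem = (tmp & ~mask) | value;
    return tmp;
}

int main()
{
    uint64_t cell = 0xffff0000ffff0000ull;
    // Replace only the low 16 bits with 0x1234.
    uint64_t old = dsMskorB64(cell, 0xffffull, 0x1234ull);
    assert(old == 0xffff0000ffff0000ull);
    assert(cell == 0xffff0000ffff1234ull);
    return 0;
}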
+ void + Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE2_B64 class methods --- + + Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2_B64 + + Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() + { + } // ~Inst_DS__DS_WRITE2_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. + // Write 2 qwords. + void + Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8; + Addr offset1 = instData.OFFSET1 * 8; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_WRITE2ST64_B64 class methods --- + + Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2st64_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2ST64_B64 + + Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() + { + } // ~Inst_DS__DS_WRITE2ST64_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; + // Write 2 qwords. 
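For the write2 forms, each lane stages two qwords back to back in d_data, and the OFFSET0/OFFSET1 immediates are element indices rather than byte offsets: initiateAcc() scales them by the 8-byte element size, and the st64 variant whose execute() follows additionally multiplies by a 64-element stride. A short sketch of that offset arithmetic with illustrative values:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t elemSize = 8;     // qword elements for the *_b64 forms
    uint8_t offset0 = 2, offset1 = 5;

    // ds_write2_b64: byte offsets are the immediates scaled by element size.
    uint64_t off0 = offset0 * elemSize;          // 16
    uint64_t off1 = offset1 * elemSize;          // 40

    // ds_write2st64_b64: the stride between elements is 64 qwords.
    uint64_t off0st64 = offset0 * elemSize * 64; // 1024
    uint64_t off1st64 = offset1 * elemSize * 64; // 2560

    std::printf("%lu %lu %lu %lu\n",
                (unsigned long)off0, (unsigned long)off1,
                (unsigned long)off0st64, (unsigned long)off1st64);
    return 0;
}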
+ void + Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8 * 64; + Addr offset1 = instData.OFFSET1 * 8 * 64; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_CMPST_B64 class methods --- + + Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_b64") + { + } // Inst_DS__DS_CMPST_B64 + + Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() + { + } // ~Inst_DS__DS_CMPST_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. + void + Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_F64 class methods --- + + Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_f64") + { + setFlag(F64); + } // Inst_DS__DS_CMPST_F64 + + Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64() + { + } // ~Inst_DS__DS_CMPST_F64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. + void + Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_F64 class methods --- + + Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_f64") + { + setFlag(F64); + } // Inst_DS__DS_MIN_F64 + + Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64() + { + } // ~Inst_DS__DS_MIN_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN_X2. 
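Both compare-and-store descriptions repeat the same caution: DATA carries the compare value and DATA2 the value to store, the opposite of the buffer-atomic cmpswap operand order. A host-side sketch of the documented semantics with the operand roles spelled out (these opcodes remain unimplemented in the patch):

#include <cstdint>
#include <cassert>

// tmp = MEM[ADDR]; MEM[ADDR] = (tmp == cmp) ? src : tmp; return tmp.
// For ds_cmpst, cmp comes from DATA and src from DATA2.
uint64_t dsCmpstB64(uint64_t &mem, uint64_t cmp /* DATA */,
                    uint64_t src /* DATA2 */)
{
    uint64_t tmp = mem;
    mem = (tmp == cmp) ? src : tmp;
    return tmp;
}

int main()
{
    uint64_t cell = 7;
    assert(dsCmpstB64(cell, 7, 42) == 7 && cell == 42);   // match: stored
    assert(dsCmpstB64(cell, 7, 99) == 42 && cell == 42);  // no match
    return 0;
}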
+ void + Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_F64 class methods --- + + Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_f64") + { + setFlag(F64); + } // Inst_DS__DS_MAX_F64 + + Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64() + { + } // ~Inst_DS__DS_MAX_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. + // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX_X2. + void + Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_RTN_U64 class methods --- + + Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_u64") + { + } // Inst_DS__DS_ADD_RTN_U64 + + Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64() + { + } // ~Inst_DS__DS_ADD_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_RTN_U64 class methods --- + + Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_rtn_u64") + { + } // Inst_DS__DS_SUB_RTN_U64 + + Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64() + { + } // ~Inst_DS__DS_SUB_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_RTN_U64 class methods --- + + Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_rtn_u64") + { + } // Inst_DS__DS_RSUB_RTN_U64 + + Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64() + { + } // ~Inst_DS__DS_RSUB_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. + void + Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_RTN_U64 class methods --- + + Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_rtn_u64") + { + } // Inst_DS__DS_INC_RTN_U64 + + Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64() + { + } // ~Inst_DS__DS_INC_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_RTN_U64 class methods --- + + Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_rtn_u64") + { + } // Inst_DS__DS_DEC_RTN_U64 + + Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64() + { + } // ~Inst_DS__DS_DEC_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_I64 class methods --- + + Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_i64") + { + } // Inst_DS__DS_MIN_RTN_I64 + + Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64() + { + } // ~Inst_DS__DS_MIN_RTN_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_I64 class methods --- + + Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_i64") + { + } // Inst_DS__DS_MAX_RTN_I64 + + Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64() + { + } // ~Inst_DS__DS_MAX_RTN_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_U64 class methods --- + + Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_u64") + { + } // Inst_DS__DS_MIN_RTN_U64 + + Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64() + { + } // ~Inst_DS__DS_MIN_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_U64 class methods --- + + Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_u64") + { + } // Inst_DS__DS_MAX_RTN_U64 + + Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64() + { + } // ~Inst_DS__DS_MAX_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_RTN_B64 class methods --- + + Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_rtn_b64") + { + } // Inst_DS__DS_AND_RTN_B64 + + Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64() + { + } // ~Inst_DS__DS_AND_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_RTN_B64 class methods --- + + Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_rtn_b64") + { + } // Inst_DS__DS_OR_RTN_B64 + + Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64() + { + } // ~Inst_DS__DS_OR_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_RTN_B64 class methods --- + + Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_rtn_b64") + { + } // Inst_DS__DS_XOR_RTN_B64 + + Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64() + { + } // ~Inst_DS__DS_XOR_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_RTN_B64 class methods --- + + Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_rtn_b64") + { + } // Inst_DS__DS_MSKOR_RTN_B64 + + Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64() + { + } // ~Inst_DS__DS_MSKOR_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. + void + Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG_RTN_B64 class methods --- + + Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg_rtn_b64") + { + } // Inst_DS__DS_WRXCHG_RTN_B64 + + Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64() + { + } // ~Inst_DS__DS_WRXCHG_RTN_B64 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + // Write-exchange operation. + void + Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2_RTN_B64 class methods --- + + Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64") + { + } // Inst_DS__DS_WRXCHG2_RTN_B64 + + Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64() + { + } // ~Inst_DS__DS_WRXCHG2_RTN_B64 + + // --- description from .arch file --- + // Write-exchange 2 separate qwords. + void + Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2ST64_RTN_B64 class methods --- + + Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64") + { + } // Inst_DS__DS_WRXCHG2ST64_RTN_B64 + + Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64() + { + } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64 + + // --- description from .arch file --- + // Write-exchange 2 qwords with a stride of 64 qwords. + void + Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_B64 class methods --- + + Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_b64") + { + } // Inst_DS__DS_CMPST_RTN_B64 + + Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64() + { + } // ~Inst_DS__DS_CMPST_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. 
+ void + Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_F64 class methods --- + + Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_f64") + { + setFlag(F64); + } // Inst_DS__DS_CMPST_RTN_F64 + + Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64() + { + } // ~Inst_DS__DS_CMPST_RTN_F64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. + void + Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_F64 class methods --- + + Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_f64") + { + setFlag(F64); + } // Inst_DS__DS_MIN_RTN_F64 + + Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64() + { + } // ~Inst_DS__DS_MIN_RTN_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN_X2. + void + Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_F64 class methods --- + + Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_f64") + { + setFlag(F64); + } // Inst_DS__DS_MAX_RTN_F64 + + Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64() + { + } // ~Inst_DS__DS_MAX_RTN_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. + // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX_X2. + void + Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_B64 class methods --- + + Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B64 + + Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() + { + } // ~Inst_DS__DS_READ_B64 + + // --- description from .arch file --- + // RETURN_DATA = MEM[ADDR]. + // Read 1 qword. 
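The implemented reads that follow split their work across the three hooks used throughout this file: execute() gathers the per-lane addresses and issues the request to the local memory pipeline, initiateAcc() applies the immediate offset and performs the LDS access into d_data, and completeAcc() copies the staged values into the destination VGPRs. A deliberately simplified, non-gem5 model of that flow for a single-qword read; every name and type here is a schematic stand-in:

#include <cstdint>
#include <cassert>
#include <vector>

// Schematic stand-in for the state a DS load carries between the hooks.
struct FakeLoad
{
    std::vector<uint32_t> addr;    // per-lane byte address (from the ADDR VGPR)
    std::vector<uint64_t> d_data;  // staging buffer filled by the LDS access
    std::vector<uint64_t> vdst;    // destination registers, written at the end
};

// Roughly the initiateAcc() step: apply the offset and read the LDS model.
void initiatePhase(FakeLoad &ld, const std::vector<uint8_t> &lds,
                   uint16_t offset)
{
    for (size_t lane = 0; lane < ld.addr.size(); ++lane) {
        uint64_t v = 0;
        for (int b = 0; b < 8; ++b)
            v |= uint64_t(lds[ld.addr[lane] + offset + b]) << (8 * b);
        ld.d_data[lane] = v;
    }
}

// Roughly the completeAcc() step: copy staged qwords to the destinations.
void completePhase(FakeLoad &ld)
{
    for (size_t lane = 0; lane < ld.addr.size(); ++lane)
        ld.vdst[lane] = ld.d_data[lane];
}

int main()
{
    std::vector<uint8_t> lds(64, 0);
    lds[8] = 0xaa;                   // one byte of the qword at address 8
    FakeLoad ld{{0}, {0}, {0}};      // a single lane, base address 0
    initiatePhase(ld, lds, 8);       // immediate offset 8
    completePhase(ld);
    assert(ld.vdst[0] == 0xaa);
    return 0;
}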
+ void + Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ2_B64 class methods --- + + Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2_B64 + + Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() + { + } // ~Inst_DS__DS_READ2_B64 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. + // Read 2 qwords. + void + Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8; + Addr offset1 = instData.OFFSET1 * 8; + + initDualMemRead(gpuDynInst, offset0, offset1); + } // initiateAcc + + void + Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst0(gpuDynInst, extData.VDST); + VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_DS__DS_READ2ST64_B64 class methods --- + + Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2st64_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2ST64_B64 + + Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() + { + } // ~Inst_DS__DS_READ2ST64_B64 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. + // Read 2 qwords. 
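In the read2 variants, the two qwords returned per lane sit adjacent in d_data, and the second destination operand is built at VDST + 2 because each 64-bit element spans two consecutive 32-bit VGPRs. A small sketch of that unpacking, with plain arrays standing in for the operand wrappers:

#include <cstdint>
#include <cassert>

constexpr int kLanes = 64;   // wavefront size assumed by this ISA model

void unpackRead2(const uint64_t (&d_data)[kLanes * 2],
                 uint64_t (&vdst0)[kLanes], uint64_t (&vdst1)[kLanes])
{
    for (int lane = 0; lane < kLanes; ++lane) {
        vdst0[lane] = d_data[lane * 2];      // first qword -> VDST
        vdst1[lane] = d_data[lane * 2 + 1];  // second qword -> VDST + 2
    }
}

int main()
{
    uint64_t d_data[kLanes * 2] = {};
    d_data[0] = 0x11; d_data[1] = 0x22;      // lane 0's two results
    uint64_t vdst0[kLanes] = {}, vdst1[kLanes] = {};
    unpackRead2(d_data, vdst0, vdst1);
    assert(vdst0[0] == 0x11 && vdst1[0] == 0x22);
    return 0;
}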
+ void + Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = (instData.OFFSET0 * 8 * 64); + Addr offset1 = (instData.OFFSET1 * 8 * 64); + + initDualMemRead(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst0(gpuDynInst, extData.VDST); + VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } + // --- Inst_DS__DS_CONDXCHG32_RTN_B64 class methods --- + + Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") + { + } // Inst_DS__DS_CONDXCHG32_RTN_B64 + + Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() + { + } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 + + // --- description from .arch file --- + // Conditional write exchange. + void + Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_U32 class methods --- + + Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_u32") + { + } // Inst_DS__DS_ADD_SRC2_U32 + + Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32() + { + } // ~Inst_DS__DS_ADD_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] + MEM[B]. + void + Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_SRC2_U32 class methods --- + + Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_src2_u32") + { + } // Inst_DS__DS_SUB_SRC2_U32 + + Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32() + { + } // ~Inst_DS__DS_SUB_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] - MEM[B]. + void + Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_SRC2_U32 class methods --- + + Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_src2_u32") + { + } // Inst_DS__DS_RSUB_SRC2_U32 + + Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32() + { + } // ~Inst_DS__DS_RSUB_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] - MEM[A]. 
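The *_src2_* opcodes in this stretch all share the second-address formula quoted from the .arch file: B is A plus four times either a slice of A itself (when offset1[7] is set) or the concatenation {offset1[6], offset1[6:0], offset0}. The Verilog-style concatenation reads more easily as shifts and masks; the helper below is one plausible interpretation of that pseudocode and is not part of the patch, which leaves these opcodes unimplemented:

#include <cstdint>
#include <cassert>

// One reading of: B = A + 4 * (offset1[7] ? {A[31], A[31:17]}
//                                         : {offset1[6], offset1[6:0], offset0})
uint32_t src2AddrB(uint32_t a, uint8_t offset0, uint8_t offset1)
{
    uint32_t sel;
    if (offset1 & 0x80) {
        // {A[31], A[31:17]}: 16 bits taken from the base address itself.
        sel = ((a >> 31) << 15) | ((a >> 17) & 0x7fff);
    } else {
        // {offset1[6], offset1[6:0], offset0}: a 16-bit immediate.
        sel = (((offset1 >> 6) & 0x1) << 15)
            | ((offset1 & 0x7f) << 8)
            | offset0;
    }
    return a + 4 * sel;
}

int main()
{
    // Immediate form: offset1 = 0x01, offset0 = 0x10 gives sel = 0x0110.
    assert(src2AddrB(0x1000, 0x10, 0x01) == 0x1000 + 4 * 0x0110);
    return 0;
}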
+ void + Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_SRC2_U32 class methods --- + + Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_src2_u32") + { + } // Inst_DS__DS_INC_SRC2_U32 + + Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32() + { + } // ~Inst_DS__DS_INC_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). + void + Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_SRC2_U32 class methods --- + + Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_src2_u32") + { + } // Inst_DS__DS_DEC_SRC2_U32 + + Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32() + { + } // ~Inst_DS__DS_DEC_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). + // Uint decrement. + void + Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_I32 class methods --- + + Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_i32") + { + } // Inst_DS__DS_MIN_SRC2_I32 + + Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32() + { + } // ~Inst_DS__DS_MIN_SRC2_I32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). + void + Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_I32 class methods --- + + Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_i32") + { + } // Inst_DS__DS_MAX_SRC2_I32 + + Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32() + { + } // ~Inst_DS__DS_MAX_SRC2_I32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). + void + Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_U32 class methods --- + + Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_u32") + { + } // Inst_DS__DS_MIN_SRC2_U32 + + Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32() + { + } // ~Inst_DS__DS_MIN_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). 
+ void + Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_U32 class methods --- + + Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_u32") + { + } // Inst_DS__DS_MAX_SRC2_U32 + + Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32() + { + } // ~Inst_DS__DS_MAX_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). + void + Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_SRC2_B32 class methods --- + + Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_src2_b32") + { + } // Inst_DS__DS_AND_SRC2_B32 + + Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32() + { + } // ~Inst_DS__DS_AND_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] & MEM[B]. + void + Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_SRC2_B32 class methods --- + + Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_src2_b32") + { + } // Inst_DS__DS_OR_SRC2_B32 + + Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32() + { + } // ~Inst_DS__DS_OR_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] | MEM[B]. + void + Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_SRC2_B32 class methods --- + + Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_src2_b32") + { + } // Inst_DS__DS_XOR_SRC2_B32 + + Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32() + { + } // ~Inst_DS__DS_XOR_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] ^ MEM[B]. + void + Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_SRC2_B32 class methods --- + + Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_src2_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_SRC2_B32 + + Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32() + { + } // ~Inst_DS__DS_WRITE_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B]. + // Write dword. 
+ void + Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_F32 class methods --- + + Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_SRC2_F32 + + Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32() + { + } // ~Inst_DS__DS_MIN_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_F32 class methods --- + + Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_SRC2_F32 + + Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32() + { + } // ~Inst_DS__DS_MAX_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_F32 class methods --- + + Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_ADD_SRC2_F32 + + Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32() + { + } // ~Inst_DS__DS_ADD_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] + MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_RELEASE_ALL class methods --- + + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_release_all") + { + } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL + + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL() + { + } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL + + // --- description from .arch file --- + // GDS Only: The GWS resource (rid) indicated will process this opcode by + // updating the counter and labeling the specified resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Incr the state counter of the resource + // state.counter[rid] = state.wave_in_queue; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release ALL queued waves; it Will have no effect if no + // --- waves are present. + void + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_INIT class methods --- + + Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_init") + { + } // Inst_DS__DS_GWS_INIT + + Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT() + { + } // ~Inst_DS__DS_GWS_INIT + + // --- description from .arch file --- + // GDS Only: Initialize a barrier or semaphore resource. 
+ // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Get the value to use in init + // index = find_first_valid(vector mask) + // value = DATA[thread: index] + // //Set the state of the resource + // state.counter[rid] = lsb(value); //limit #waves + // state.flag[rid] = 0; + // return rd_done; //release calling wave + void + Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_V class methods --- + + Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_v") + { + } // Inst_DS__DS_GWS_SEMA_V + + Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V() + { + } // ~Inst_DS__DS_GWS_SEMA_V + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // updating the counter and labeling the resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Incr the state counter of the resource + // state.counter[rid]++; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release one waved if any are queued in this resource. + void + Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_BR class methods --- + + Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_br") + { + } // Inst_DS__DS_GWS_SEMA_BR + + Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR() + { + } // ~Inst_DS__DS_GWS_SEMA_BR + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // updating the counter by the bulk release delivered count and labeling + // the resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // index = find first valid (vector mask) + // count = DATA[thread: index]; + // //Add count to the resource state counter + // state.counter[rid] += count; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release count number of waves, immediately if queued, + // or as they arrive from the noted resource. + void + Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_P class methods --- + + Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_p") + { + } // Inst_DS__DS_GWS_SEMA_P + + Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P() + { + } // ~Inst_DS__DS_GWS_SEMA_P + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // queueing it until counter enables a release and then decrementing the + // counter of the resource as a semaphore. 
+ // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // state.type = SEMAPHORE; + // ENQUEUE until(state[rid].counter > 0) + // state[rid].counter--; + // return rd_done + void + Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_BARRIER class methods --- + + Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_barrier") + { + } // Inst_DS__DS_GWS_BARRIER + + Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER() + { + } // ~Inst_DS__DS_GWS_BARRIER + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // queueing it until barrier is satisfied. The number of waves needed is + // passed in as DATA of first valid thread. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + OFFSET0[5:0]; + // index = find first valid (vector mask); + // value = DATA[thread: index]; + // // Input Decision Machine + // state.type[rid] = BARRIER; + // if (state[rid].counter <= 0) { + // thread[rid].flag = state[rid].flag; + // ENQUEUE; + // state[rid].flag = !state.flag; + // state[rid].counter = value; + // return rd_done; + // } else { + // state[rid].counter--; + // thread.flag = state[rid].flag; + // ENQUEUE; + // } + // Since the waves deliver the count for the next barrier, this function + // can have a different size barrier for each occurrence. + // // Release Machine + // if (state.type == BARRIER) { + // if (state.flag != thread.flag) { + // return rd_done; + // } + // } + void + Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CONSUME class methods --- + + Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_consume") + { + } // Inst_DS__DS_CONSUME + + Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME() + { + } // ~Inst_DS__DS_CONSUME + + // --- description from .arch file --- + // LDS & GDS. Subtract (count_bits(exec_mask)) from the value stored in DS + // memory at (M0.base + instr_offset). Return the pre-operation value to + // VGPRs. + void + Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_APPEND class methods --- + + Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_append") + { + } // Inst_DS__DS_APPEND + + Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND() + { + } // ~Inst_DS__DS_APPEND + + // --- description from .arch file --- + // LDS & GDS. Add (count_bits(exec_mask)) to the value stored in DS memory + // at (M0.base + instr_offset). Return the pre-operation value to VGPRs. + void + Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ORDERED_COUNT class methods --- + + Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_ordered_count") + { + } // Inst_DS__DS_ORDERED_COUNT + + Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT() + { + } // ~Inst_DS__DS_ORDERED_COUNT + + // --- description from .arch file --- + // GDS-only. Add (count_bits(exec_mask)) to one of 4 dedicated + // ordered-count counters (aka 'packers'). Additional bits of instr.offset + // field are overloaded to hold packer-id, 'last'. 
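ds_consume and ds_append, described above, operate on the execution mask itself: they subtract or add the number of active lanes to the counter held in DS memory at M0.base plus the instruction offset and hand the pre-operation value back to the VGPRs. A host-side sketch of the append case, with popcount standing in for count_bits(exec_mask):

#include <bitset>
#include <cstdint>
#include <cassert>

// MEM[M0.base + offset] += popcount(exec_mask); return the old value.
uint32_t dsAppend(uint32_t &counter, const std::bitset<64> &execMask)
{
    uint32_t tmp = counter;
    counter += static_cast<uint32_t>(execMask.count());
    return tmp;
}

int main()
{
    uint32_t counter = 100;
    std::bitset<64> execMask;
    execMask.set(0); execMask.set(5); execMask.set(63);  // 3 active lanes
    uint32_t base = dsAppend(counter, execMask);
    assert(base == 100 && counter == 103);
    return 0;
}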
+ void + Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_U64 class methods --- + + Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_u64") + { + } // Inst_DS__DS_ADD_SRC2_U64 + + Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64() + { + } // ~Inst_DS__DS_ADD_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] + MEM[B]. + void + Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_SRC2_U64 class methods --- + + Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_src2_u64") + { + } // Inst_DS__DS_SUB_SRC2_U64 + + Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64() + { + } // ~Inst_DS__DS_SUB_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] - MEM[B]. + void + Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_SRC2_U64 class methods --- + + Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_src2_u64") + { + } // Inst_DS__DS_RSUB_SRC2_U64 + + Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64() + { + } // ~Inst_DS__DS_RSUB_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] - MEM[A]. + void + Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_SRC2_U64 class methods --- + + Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_src2_u64") + { + } // Inst_DS__DS_INC_SRC2_U64 + + Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64() + { + } // ~Inst_DS__DS_INC_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). + void + Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_SRC2_U64 class methods --- + + Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_src2_u64") + { + } // Inst_DS__DS_DEC_SRC2_U64 + + Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64() + { + } // ~Inst_DS__DS_DEC_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). + // Uint decrement. 
+ void + Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_I64 class methods --- + + Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_i64") + { + } // Inst_DS__DS_MIN_SRC2_I64 + + Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64() + { + } // ~Inst_DS__DS_MIN_SRC2_I64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). + void + Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_I64 class methods --- + + Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_i64") + { + } // Inst_DS__DS_MAX_SRC2_I64 + + Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64() + { + } // ~Inst_DS__DS_MAX_SRC2_I64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). + void + Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_U64 class methods --- + + Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_u64") + { + } // Inst_DS__DS_MIN_SRC2_U64 + + Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64() + { + } // ~Inst_DS__DS_MIN_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). + void + Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_U64 class methods --- + + Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_u64") + { + } // Inst_DS__DS_MAX_SRC2_U64 + + Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64() + { + } // ~Inst_DS__DS_MAX_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). + void + Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_SRC2_B64 class methods --- + + Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_src2_b64") + { + } // Inst_DS__DS_AND_SRC2_B64 + + Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() + { + } // ~Inst_DS__DS_AND_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] & MEM[B]. + void + Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_SRC2_B64 class methods --- + + Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_src2_b64") + { + } // Inst_DS__DS_OR_SRC2_B64 + + Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() + { + } // ~Inst_DS__DS_OR_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? 
{A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] | MEM[B]. + void + Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_SRC2_B64 class methods --- + + Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_src2_b64") + { + } // Inst_DS__DS_XOR_SRC2_B64 + + Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() + { + } // ~Inst_DS__DS_XOR_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] ^ MEM[B]. + void + Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_SRC2_B64 class methods --- + + Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_src2_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_SRC2_B64 + + Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() + { + } // ~Inst_DS__DS_WRITE_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B]. + // Write qword. + void + Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_F64 class methods --- + + Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_f64") + { + setFlag(F64); + } // Inst_DS__DS_MIN_SRC2_F64 + + Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() + { + } // ~Inst_DS__DS_MIN_SRC2_F64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_F64 class methods --- + + Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_f64") + { + setFlag(F64); + } // Inst_DS__DS_MAX_SRC2_F64 + + Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() + { + } // ~Inst_DS__DS_MAX_SRC2_F64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_B96 class methods --- + + Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b96") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B96 + + Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() + { + } // ~Inst_DS__DS_WRITE_B96 + + // --- description from .arch file --- + // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0]. + // Tri-dword write. 
+ void + Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<3>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B128 class methods --- + + Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b128") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B128 + + Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() + { + } // ~Inst_DS__DS_WRITE_B128 + + // --- description from .arch file --- + // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. + // Qword write. 
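Note how ds_write_b96 above stages its payload: three consecutive VGPRs (DATA0 through DATA0 + 2) are copied into d_data at a stride of four dwords per active lane, with the fourth slot left untouched, and initiateAcc then issues a 3-dword write via initMemWrite<3>. ds_write_b128, which follows, fills that fourth slot as well. A minimal standalone model of the staging step, using a hypothetical buffer and wavefront size rather than the simulator's types:

    #include <array>
    #include <cstdint>

    constexpr int kLanes = 64;                     // wavefront size assumed here
    std::array<uint32_t, kLanes * 4> d_data{};     // four dword slots per lane

    void stageWriteB96(int lane, uint32_t d0, uint32_t d1, uint32_t d2)
    {
        d_data[lane * 4 + 0] = d0;
        d_data[lane * 4 + 1] = d1;
        d_data[lane * 4 + 2] = d2;                 // slot 3 is unused for b96
    }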
+ void + Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_READ_B96 class methods --- + + Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b96") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B96 + + Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() + { + } // ~Inst_DS__DS_READ_B96 + + // --- description from .arch file --- + // Tri-dword read. 
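Both 96- and 128-bit DS writes above (and the DS reads that follow) build the byte offset the same way in initiateAcc: OFFSET1 supplies the high byte and OFFSET0 the low byte of an unsigned 16-bit displacement. A one-line worked example:

    #include <cstdint>

    // OFFSET0 and OFFSET1 are the two 8-bit immediate fields of the DS encoding.
    uint64_t dsOffset(uint8_t offset0, uint8_t offset1)
    {
        return (uint64_t(offset1) << 8) | offset0;     // 0..65535 bytes
    }
    // e.g. offset1 = 0x01, offset0 = 0x10 gives 0x110, i.e. 272 bytes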
+ void + Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<3>(gpuDynInst, offset); + } + + void + Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } + // --- Inst_DS__DS_READ_B128 class methods --- + + Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b128") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B128 + + Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() + { + } // ~Inst_DS__DS_READ_B128 + + // --- description from .arch file --- + // Qword read. + void + Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/exp.cc b/src/arch/amdgpu/vega/insts/exp.cc new file mode 100644 index 0000000000..31b6ded10f --- /dev/null +++ b/src/arch/amdgpu/vega/insts/exp.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_EXP__EXP class methods --- + + Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt) + : Inst_EXP(iFmt, "exp") + { + } // Inst_EXP__EXP + + Inst_EXP__EXP::~Inst_EXP__EXP() + { + } // ~Inst_EXP__EXP + + // --- description from .arch file --- + // Export through SX. + void + Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/flat.cc b/src/arch/amdgpu/vega/insts/flat.cc new file mode 100644 index 0000000000..7f79025b3f --- /dev/null +++ b/src/arch/amdgpu/vega/insts/flat.cc @@ -0,0 +1,2138 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_FLAT__FLAT_LOAD_UBYTE class methods --- + + Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_ubyte") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_UBYTE + + Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() + { + } // ~Inst_FLAT__FLAT_LOAD_UBYTE + + // --- description from .arch file --- + // Untyped buffer load unsigned byte (zero extend to VGPR destination). + void + Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } + } + vdst.write(); + } // execute + // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods --- + + Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_sbyte") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_SBYTE + + Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() + { + } // ~Inst_FLAT__FLAT_LOAD_SBYTE + + // --- description from .arch file --- + // Untyped buffer load signed byte (sign extend to VGPR destination). + void + Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_LOAD_USHORT class methods --- + + Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_ushort") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_USHORT + + Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() + { + } // ~Inst_FLAT__FLAT_LOAD_USHORT + + // --- description from .arch file --- + // Untyped buffer load unsigned short (zero extend to VGPR destination). 
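In the completeAcc above, flat_load_ubyte zero-extends each returned byte into its 32-bit VGPR destination simply by converting the unsigned byte value; flat_load_sbyte, which would need a sign extension instead, remains panicUnimplemented() in this patch. The two conversions, shown on plain integer types rather than the simulator's operand classes:

    #include <cstdint>

    uint32_t zextByte(uint8_t b) { return uint32_t(b); }                    // flat_load_ubyte
    uint32_t sextByte(uint8_t b) { return uint32_t(int32_t(int8_t(b))); }   // what sbyte would need

    // zextByte(0xff) == 0x000000ff, while sextByte(0xff) == 0xffffffff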
+ void + Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } + } + vdst.write(); + } // execute + + // --- Inst_FLAT__FLAT_LOAD_SSHORT class methods --- + + Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_sshort") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_SSHORT + + Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() + { + } // ~Inst_FLAT__FLAT_LOAD_SSHORT + + // --- description from .arch file --- + // Untyped buffer load signed short (sign extend to VGPR destination). + void + Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_LOAD_DWORD class methods --- + + Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORD + + Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORD + + // --- description from .arch file --- + // Untyped buffer load dword. 
+ void + Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + vdst.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_LOAD_DWORDX2 class methods --- + + Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORDX2 + + Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer load 2 dwords. + void + Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + vdst.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_LOAD_DWORDX3 class methods --- + + Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dwordx3") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORDX3 + + Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer load 3 dwords. 
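flat_load_dword and flat_load_dwordx2 above both unpack d_data with one element per lane, the former as 32-bit elements and the latter reinterpreted as 64-bit elements written to a VecOperandU64; the x3 and x4 variants that follow switch back to 32-bit elements with an explicit per-lane stride of 3 or 4. A minimal model of the per-lane indexing, assuming a raw byte buffer in place of gem5's d_data:

    #include <cstdint>
    #include <cstring>

    uint32_t laneDword(const uint8_t *buf, int lane)
    {
        uint32_t v;
        std::memcpy(&v, buf + lane * sizeof(v), sizeof(v));
        return v;
    }

    uint64_t laneDwordX2(const uint8_t *buf, int lane)
    {
        uint64_t v;
        std::memcpy(&v, buf + lane * sizeof(v), sizeof(v));
        return v;
    }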
+ void + Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<3>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_LOAD_DWORDX4 class methods --- + + Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORDX4 + + Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer load 4 dwords. + void + Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_BYTE class methods --- + + Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_byte") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_BYTE + + Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE() + { + } // ~Inst_FLAT__FLAT_STORE_BYTE + + 
// --- description from .arch file --- + // Untyped buffer store byte. + void + Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU8 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_STORE_SHORT class methods --- + + Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_short") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_SHORT + + Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT() + { + } // ~Inst_FLAT__FLAT_STORE_SHORT + + // --- description from .arch file --- + // Untyped buffer store short. + void + Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU16 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods --- + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI:: + Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_short_d16_hi") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI() + { + } // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + // --- description from .arch file --- + // Untyped buffer store short. 
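Every implemented FLAT load and store in this file opens with the same early-out: if the exec mask has no active lanes the instruction never reaches the memory pipeline, so the issue counters are unwound (VMEM, LGKM because flat addressing also counts against the LGKM counter, and EXP for the store variants only). A tiny standalone model of that bookkeeping, with a hypothetical counter struct standing in for gem5's Wavefront:

    struct IssueCounters { int vmem = 0; int lgkm = 0; int exp = 0; };

    // Mirrors the exec_mask.none() path of the FLAT execute() methods above.
    void unwindIssue(IssueCounters &wf, bool isFlat, bool isStore)
    {
        --wf.vmem;                   // decVMemInstsIssued()
        if (isFlat) { --wf.lgkm; }   // decLGKMInstsIssued()
        if (isStore) { --wf.exp; }   // decExpInstsIssued()
    }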
+ void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = (data[lane] >> 16); + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORD class methods --- + + Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORD + + Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() + { + } // ~Inst_FLAT__FLAT_STORE_DWORD + + // --- description from .arch file --- + // Untyped buffer store dword. + void + Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX2 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX2 + + Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer store 2 dwords. 
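flat_store_short_d16_hi above reads a full 32-bit source VGPR but stores only its upper half: each active lane shifts the value right by 16 before it is staged as a 16-bit element, so the low half never leaves the register. The per-lane payload is simply:

    #include <cstdint>

    // Value actually written to memory by flat_store_short_d16_hi.
    uint16_t d16HiPayload(uint32_t vgpr)
    {
        return uint16_t(vgpr >> 16);
    }
    // e.g. vgpr = 0xBEEF1234 stores 0xBEEF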
+ void + Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU64 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX3 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx3") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX3 + + Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer store 3 dwords. + void + Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data0(gpuDynInst, extData.DATA); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); + + data0.read(); + data1.read(); + data2.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<3>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX4 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX4 + + Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer store 4 dwords. 
+ void + Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data0(gpuDynInst, extData.DATA); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); + + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SWAP class methods --- + + Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SWAP + + Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + + // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP + ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() + { + } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. 
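Every FLAT atomic in this file picks the returning or non-returning flavour in its constructor from the GLC bit of the encoding: with GLC set the pre-operation memory value is returned to the destination VGPRs (AtomicReturn), otherwise it is discarded (AtomicNoReturn). The pattern each constructor repeats, with only the operation flag changing per opcode, is:

    setFlag(AtomicCAS);            // AtomicExch, AtomicAdd, ... per opcode
    if (instData.GLC) {
        setFlag(AtomicReturn);     // old value comes back to VDST
    } else {
        setFlag(AtomicNoReturn);
    }
    setFlag(MemoryRef);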
+ void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD + + Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SUB class methods --- + + Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SUB + + Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMIN class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMIN + + Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. 
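The actual read-modify-write for these atomics lives in the shared atomicExecute, initAtomicAccess, and atomicComplete helpers; the per-opcode behaviour is documented only by the .arch comments. For flat_atomic_cmpswap, the 32-bit pseudocode above corresponds to this scalar operation, shown illustratively on plain memory rather than the simulated address space:

    #include <cstdint>

    // src = DATA[0], cmp = DATA[1]; returns the pre-operation memory value.
    uint32_t cmpswap32(uint32_t &mem, uint32_t src, uint32_t cmp)
    {
        uint32_t tmp = mem;
        if (tmp == cmp) {
            mem = src;
        }
        return tmp;    // RETURN_DATA[0], written back only when GLC is set
    }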
+ void + Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMIN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMIN::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMIN class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMIN + + Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMIN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMIN::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMAX class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMAX + + Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMAX::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMAX::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMAX class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMAX + + Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMAX::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMAX::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_AND class methods --- + + Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_and") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_AND + + Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() + { + } // ~Inst_FLAT__FLAT_ATOMIC_AND + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_AND::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_AND::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_OR class methods --- + + Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_or") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_OR + + Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() + { + } // ~Inst_FLAT__FLAT_ATOMIC_OR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_OR::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_OR::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + + // --- Inst_FLAT__FLAT_ATOMIC_XOR class methods --- + + Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_xor") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_XOR + + Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() + { + } // ~Inst_FLAT__FLAT_ATOMIC_XOR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_XOR::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_XOR::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_INC class methods --- + + Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_inc") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_INC + + Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() + { + } // ~Inst_FLAT__FLAT_ATOMIC_INC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_DEC class methods --- + + Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_dec") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_DEC + + Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() + { + } // ~Inst_FLAT__FLAT_ATOMIC_DEC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SWAP_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_swap_x2") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 + + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
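flat_atomic_inc and flat_atomic_dec above are the wrapping variants their .arch comments describe: inc rolls over to zero once the stored value reaches DATA, and dec reloads DATA when the value is zero or already above it, both using unsigned compares. In scalar form:

    #include <cstdint>

    uint32_t atomicIncWrap(uint32_t &mem, uint32_t data)
    {
        uint32_t tmp = mem;
        mem = (tmp >= data) ? 0u : tmp + 1u;
        return tmp;                          // RETURN_DATA
    }

    uint32_t atomicDecWrap(uint32_t &mem, uint32_t data)
    {
        uint32_t tmp = mem;
        mem = (tmp == 0u || tmp > data) ? data : tmp - 1u;
        return tmp;                          // RETURN_DATA
    }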
+ void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA[0:1]; + // cmp = DATA[2:3]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_x2") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 + + Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SUB_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_sub_x2") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 + + Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMIN_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 + + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMIN_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 + + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMAX_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 + + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMAX_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 + + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_AND_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_and_x2") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_AND_X2 + + Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_OR_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_or_x2") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_OR_X2 + + Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_OR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_OR_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_XOR_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_xor_x2") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 + + Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_INC_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_inc_x2") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_INC_X2 + + Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_DEC_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_dec_x2") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_F32 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_F32::Inst_FLAT__FLAT_ATOMIC_ADD_F32( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_f32") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_F32 + + Inst_FLAT__FLAT_ATOMIC_ADD_F32::~Inst_FLAT__FLAT_ATOMIC_ADD_F32() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F32 + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 class methods --- + + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_pk_add_f16") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 + + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16() + { + } // ~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 + + void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_F64::Inst_FLAT__FLAT_ATOMIC_ADD_F64( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_f64") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_F64 + + Inst_FLAT__FLAT_ATOMIC_ADD_F64::~Inst_FLAT__FLAT_ATOMIC_ADD_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F64 + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_MIN_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_MIN_F64::Inst_FLAT__FLAT_ATOMIC_MIN_F64( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_min_f64") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_MIN_F64 + + Inst_FLAT__FLAT_ATOMIC_MIN_F64::~Inst_FLAT__FLAT_ATOMIC_MIN_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_MIN_F64 + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::execute(GPUDynInstPtr gpuDynInst) + { + 
atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_MAX_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_MAX_F64::Inst_FLAT__FLAT_ATOMIC_MAX_F64( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_max_f64") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_MAX_F64 + + Inst_FLAT__FLAT_ATOMIC_MAX_F64::~Inst_FLAT__FLAT_ATOMIC_MAX_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_MAX_F64 + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/inst_util.hh b/src/arch/amdgpu/vega/insts/inst_util.hh index 7ec2e2ddd3..bc64ff88da 100644 --- a/src/arch/amdgpu/vega/insts/inst_util.hh +++ b/src/arch/amdgpu/vega/insts/inst_util.hh @@ -35,6 +35,7 @@ #include #include "arch/amdgpu/vega/gpu_registers.hh" +#include "arch/amdgpu/vega/insts/gpu_static_inst.hh" namespace gem5 { @@ -315,7 +316,8 @@ namespace VegaISA * 0x142: broadcast 15th thread of each row to next row * 0x143: broadcast thread 31 to rows 2 and 3 */ - int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, + inline int + dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool & outOfBounds) { // local variables diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc deleted file mode 100644 index 651b6dc9f9..0000000000 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ /dev/null @@ -1,46539 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "arch/amdgpu/vega/insts/instructions.hh" - -#include - -#include "arch/amdgpu/vega/insts/inst_util.hh" -#include "debug/VEGA.hh" -#include "debug/GPUSync.hh" -#include "dev/amdgpu/hwreg_defines.hh" -#include "gpu-compute/shader.hh" - -namespace gem5 -{ - -namespace VegaISA -{ - // --- Inst_SOP2__S_ADD_U32 class methods --- - - Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_U32 - - Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() - { - } // ~Inst_SOP2__S_ADD_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // SCC = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an unsigned - // --- overflow/carry-out for S_ADDC_U32. - void - Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) - >= 0x100000000ULL ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUB_U32 class methods --- - - Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_U32 - - Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() - { - } // ~Inst_SOP2__S_SUB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out for - // --- S_SUBB_U32. - void - Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (src1.rawData() > src0.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ADD_I32 class methods --- - - Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_I32 - - Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() - { - } // ~Inst_SOP2__S_ADD_I32 - - // --- description from .arch file --- - // D.i = S0.i + S1.i; - // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. - // This opcode is not suitable for use with S_ADDC_U32 for implementing - // 64-bit operations. 
- void - Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) - ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUB_I32 class methods --- - - Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_I32 - - Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() - { - } // ~Inst_SOP2__S_SUB_I32 - - // --- description from .arch file --- - // D.i = S0.i - S1.i; - // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. - // CAUTION: The condition code behaviour for this opcode is inconsistent - // with V_SUB_I32; see V_SUB_I32 for further details. - // This opcode is not suitable for use with S_SUBB_U32 for implementing - // 64-bit operations. - void - Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ADDC_U32 class methods --- - - Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_addc_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADDC_U32 - - Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() - { - } // ~Inst_SOP2__S_ADDC_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + SCC; - // SCC = (S0.u + S1.u + SCC >= 0x800000000ULL ? 1 : 0) is an unsigned - // overflow. - void - Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() + src1.rawData() + scc.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() - + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUBB_U32 class methods --- - - Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_subb_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUBB_U32 - - Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() - { - } // ~Inst_SOP2__S_SUBB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - SCC; - // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. 
- void - Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() - src1.rawData() - scc.rawData(); - scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MIN_I32 class methods --- - - Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_I32 - - Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() - { - } // ~Inst_SOP2__S_MIN_I32 - - // --- description from .arch file --- - // D.i = (S0.i < S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the minimum value. - void - Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MIN_U32 class methods --- - - Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_U32 - - Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() - { - } // ~Inst_SOP2__S_MIN_U32 - - // --- description from .arch file --- - // D.u = (S0.u < S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the minimum value. - void - Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MAX_I32 class methods --- - - Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_I32 - - Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() - { - } // ~Inst_SOP2__S_MAX_I32 - - // --- description from .arch file --- - // D.i = (S0.i > S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the maximum value. - void - Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MAX_U32 class methods --- - - Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_U32 - - Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() - { - } // ~Inst_SOP2__S_MAX_U32 - - // --- description from .arch file --- - // D.u = (S0.u > S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the maximum value. 
- void - Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_CSELECT_B32 class methods --- - - Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B32 - - Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() - { - } // ~Inst_SOP2__S_CSELECT_B32 - - // --- description from .arch file --- - // D.u = SCC ? S0.u : S1.u (conditional select). - void - Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? src0.rawData() : src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_CSELECT_B64 class methods --- - - Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B64 - - Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() - { - } // ~Inst_SOP2__S_CSELECT_B64 - - // --- description from .arch file --- - // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). - void - Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? src0.rawData() : src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_AND_B32 class methods --- - - Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_and_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_AND_B32 - - Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() - { - } // ~Inst_SOP2__S_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() & src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_AND_B64 class methods --- - - Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_and_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_AND_B64 - - Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() - { - } // ~Inst_SOP2__S_AND_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 & S1.u64; - // SCC = 1 if result is non-zero. 
- void - Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() & src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_OR_B32 class methods --- - - Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_or_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_OR_B32 - - Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() - { - } // ~Inst_SOP2__S_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() | src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_OR_B64 class methods --- - - Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_or_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_OR_B64 - - Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() - { - } // ~Inst_SOP2__S_OR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 | S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() | src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XOR_B32 class methods --- - - Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_XOR_B32 - - Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() - { - } // ~Inst_SOP2__S_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() ^ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XOR_B64 class methods --- - - Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_XOR_B64 - - Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() - { - } // ~Inst_SOP2__S_XOR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 ^ S1.u64; - // SCC = 1 if result is non-zero. 
- void - Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() ^ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ANDN2_B32 class methods --- - - Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_andn2_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_ANDN2_B32 - - Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() - { - } // ~Inst_SOP2__S_ANDN2_B32 - - // --- description from .arch file --- - // D.u = S0.u & ~S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() &~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ANDN2_B64 class methods --- - - Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_andn2_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_ANDN2_B64 - - Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() - { - } // ~Inst_SOP2__S_ANDN2_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 & ~S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() &~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ORN2_B32 class methods --- - - Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_orn2_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_ORN2_B32 - - Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() - { - } // ~Inst_SOP2__S_ORN2_B32 - - // --- description from .arch file --- - // D.u = S0.u | ~S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() |~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ORN2_B64 class methods --- - - Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_orn2_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_ORN2_B64 - - Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() - { - } // ~Inst_SOP2__S_ORN2_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 | ~S1.u64; - // SCC = 1 if result is non-zero. 
- void - Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() |~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NAND_B32 class methods --- - - Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nand_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_NAND_B32 - - Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() - { - } // ~Inst_SOP2__S_NAND_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u & S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() & src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NAND_B64 class methods --- - - Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nand_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_NAND_B64 - - Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() - { - } // ~Inst_SOP2__S_NAND_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 & S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() & src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NOR_B32 class methods --- - - Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_NOR_B32 - - Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() - { - } // ~Inst_SOP2__S_NOR_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u | S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() | src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NOR_B64 class methods --- - - Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_NOR_B64 - - Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() - { - } // ~Inst_SOP2__S_NOR_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 | S1.u64); - // SCC = 1 if result is non-zero. 
- void - Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() | src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XNOR_B32 class methods --- - - Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xnor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_XNOR_B32 - - Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() - { - } // ~Inst_SOP2__S_XNOR_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u ^ S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() ^ src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XNOR_B64 class methods --- - - Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xnor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_XNOR_B64 - - Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() - { - } // ~Inst_SOP2__S_XNOR_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 ^ S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() ^ src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHL_B32 class methods --- - - Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshl_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHL_B32 - - Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() - { - } // ~Inst_SOP2__S_LSHL_B32 - - // --- description from .arch file --- - // D.u = S0.u << S1.u[4:0]; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() << bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHL_B64 class methods --- - - Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshl_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHL_B64 - - Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() - { - } // ~Inst_SOP2__S_LSHL_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 << S1.u[5:0]; - // SCC = 1 if result is non-zero. 
- void - Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() << bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHR_B32 class methods --- - - Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshr_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHR_B32 - - Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() - { - } // ~Inst_SOP2__S_LSHR_B32 - - // --- description from .arch file --- - // D.u = S0.u >> S1.u[4:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to zero. - void - Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHR_B64 class methods --- - - Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshr_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHR_B64 - - Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() - { - } // ~Inst_SOP2__S_LSHR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 >> S1.u[5:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to zero. - void - Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ASHR_I32 class methods --- - - Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_ashr_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ASHR_I32 - - Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() - { - } // ~Inst_SOP2__S_ASHR_I32 - - // --- description from .arch file --- - // D.i = signext(S0.i) >> S1.u[4:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to the sign bit of the input value. - void - Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ASHR_I64 class methods --- - - Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_ashr_i64") - { - setFlag(ALU); - } // Inst_SOP2__S_ASHR_I64 - - Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() - { - } // ~Inst_SOP2__S_ASHR_I64 - - // --- description from .arch file --- - // D.i64 = signext(S0.i64) >> S1.u[5:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to the sign bit of the input value. - void - Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFM_B32 class methods --- - - Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfm_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFM_B32 - - Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32() - { - } // ~Inst_SOP2__S_BFM_B32 - - // --- description from .arch file --- - // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask). - void - Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1) - << bits(src1.rawData(), 4, 0); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_BFM_B64 class methods --- - - Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfm_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFM_B64 - - Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64() - { - } // ~Inst_SOP2__S_BFM_B64 - - // --- description from .arch file --- - // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask). - void - Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1) - << bits(src1.rawData(), 5, 0); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_MUL_I32 class methods --- - - Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_I32 - - Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32() - { - } // ~Inst_SOP2__S_MUL_I32 - - // --- description from .arch file --- - // D.i = S0.i * S1.i. 
- void - Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = src0.rawData() * src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_BFE_U32 class methods --- - - Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_U32 - - Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32() - { - } // ~Inst_SOP2__S_BFE_U32 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is - // field width. - // D.u = (S0.u>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_I32 class methods --- - - Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I32 - - Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32() - { - } // ~Inst_SOP2__S_BFE_I32 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is - // field width. - // D.i = (S0.i>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - - // Above extracted a signed int of size src1[22:16] bits which needs - // to be signed-extended. Check if the MSB of our src1[22:16]-bit - // integer is 1, and sign extend it is. - // - // Note: The description in the Vega ISA manual does not mention to - // sign-extend the result. An update description can be found in the - // more recent RDNA3 manual here: - // https://developer.amd.com/wp-content/resources/ - // RDNA3_Shader_ISA_December2022.pdf - if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { - sdst = sdst.rawData() - | (0xffffffff << bits(src1.rawData(), 22, 16)); - } - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_U64 class methods --- - - Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_u64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_U64 - - Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64() - { - } // ~Inst_SOP2__S_BFE_U64 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.u64 = (S0.u64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_I64 class methods --- - - Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I64 - - Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64() - { - } // ~Inst_SOP2__S_BFE_I64 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.i64 = (S0.i64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - - // Above extracted a signed int of size src1[22:16] bits which needs - // to be signed-extended. Check if the MSB of our src1[22:16]-bit - // integer is 1, and sign extend it is. 
- if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { - sdst = sdst.rawData() - | 0xffffffffffffffff << bits(src1.rawData(), 22, 16); - } - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_CBRANCH_G_FORK class methods --- - - Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cbranch_g_fork") - { - setFlag(Branch); - } // Inst_SOP2__S_CBRANCH_G_FORK - - Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK() - { - } // ~Inst_SOP2__S_CBRANCH_G_FORK - - // --- description from .arch file --- - // mask_pass = S0.u64 & EXEC; - // mask_fail = ~S0.u64 & EXEC; - // if(mask_pass == EXEC) - // PC = S1.u64; - // elsif(mask_fail == EXEC) - // PC += 4; - // elsif(bitcount(mask_fail) < bitcount(mask_pass)) - // EXEC = mask_fail; - // SGPR[CSP*4] = { S1.u64, mask_pass }; - // CSP++; - // PC += 4; - // else - // EXEC = mask_pass; - // SGPR[CSP*4] = { PC + 4, mask_fail }; - // CSP++; - // PC = S1.u64; - // end. - // Conditional branch using branch-stack. - // S0 = compare mask(vcc or any sgpr) and - // S1 = 64-bit byte address of target instruction. - // See also S_CBRANCH_JOIN. - void - Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP2__S_ABSDIFF_I32 class methods --- - - Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_absdiff_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ABSDIFF_I32 - - Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32() - { - } // ~Inst_SOP2__S_ABSDIFF_I32 - - // --- description from .arch file --- - // D.i = S0.i - S1.i; - // if(D.i < 0) then D.i = -D.i; - // SCC = 1 if result is non-zero. - // Compute the absolute value of difference between two values. - void - Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - sdst = std::abs(src0.rawData() - src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_RFE_RESTORE_B64 class methods --- - - Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64( - InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_rfe_restore_b64") - { - } // Inst_SOP2__S_RFE_RESTORE_B64 - - Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64() - { - } // ~Inst_SOP2__S_RFE_RESTORE_B64 - - // --- description from .arch file --- - // PRIV = 0; - // PC = S0.u64; - // INST_ATC = S1.u32[0]. - // Return from exception handler and continue, possibly changing the - // --- instruction ATC mode. - // This instruction may only be used within a trap handler. - // Use this instruction when the main program may be in a different memory - // --- space than the trap handler. 
- void - Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP2__S_MUL_HI_U32 class methods --- - - Inst_SOP2__S_MUL_HI_U32::Inst_SOP2__S_MUL_HI_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_hi_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_HI_U32 - - Inst_SOP2__S_MUL_HI_U32::~Inst_SOP2__S_MUL_HI_U32() - { - } // ~Inst_SOP2__S_MUL_HI_U32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32; - void - Inst_SOP2__S_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - VecElemU64 tmp_dst = - ((VecElemU64)src0.rawData() * (VecElemU64)src1.rawData()); - sdst = (tmp_dst >> 32); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_MUL_HI_I32 class methods --- - - Inst_SOP2__S_MUL_HI_I32::Inst_SOP2__S_MUL_HI_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_hi_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_HI_I32 - - Inst_SOP2__S_MUL_HI_I32::~Inst_SOP2__S_MUL_HI_I32() - { - } // ~Inst_SOP2__S_MUL_HI_I32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32; - void - Inst_SOP2__S_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - VecElemI64 tmp_src0 = - sext::digits>(src0.rawData()); - VecElemI64 tmp_src1 = - sext::digits>(src1.rawData()); - sdst = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - - sdst.write(); - } // execute - // --- Inst_SOPK__S_MOVK_I32 class methods --- - - Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_movk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MOVK_I32 - - Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() - { - } // ~Inst_SOPK__S_MOVK_I32 - - // --- description from .arch file --- - // D.i = signext(SIMM16) (sign extension). - void - Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - sdst = simm16; - - sdst.write(); - } // execute - // --- Inst_SOPK__S_CMOVK_I32 class methods --- - - Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmovk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMOVK_I32 - - Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() - { - } // ~Inst_SOPK__S_CMOVK_I32 - - // --- description from .arch file --- - // if(SCC) then D.i = signext(SIMM16); - // else NOP. - // Conditional move with sign extension. - void - Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - sdst = simm16; - sdst.write(); - } - } // execute - // --- Inst_SOPK__S_CMPK_EQ_I32 class methods --- - - Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_I32 - - Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() - { - } // ~Inst_SOPK__S_CMPK_EQ_I32 - - // --- description from .arch file --- - // SCC = (S0.i == signext(SIMM16)). 
- void - Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LG_I32 class methods --- - - Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_I32 - - Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() - { - } // ~Inst_SOPK__S_CMPK_LG_I32 - - // --- description from .arch file --- - // SCC = (S0.i != signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GT_I32 class methods --- - - Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_I32 - - Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() - { - } // ~Inst_SOPK__S_CMPK_GT_I32 - - // --- description from .arch file --- - // SCC = (S0.i > signext(SIMM16)). - void - Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GE_I32 class methods --- - - Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_I32 - - Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() - { - } // ~Inst_SOPK__S_CMPK_GE_I32 - - // --- description from .arch file --- - // SCC = (S0.i >= signext(SIMM16)). - void - Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LT_I32 class methods --- - - Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_I32 - - Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() - { - } // ~Inst_SOPK__S_CMPK_LT_I32 - - // --- description from .arch file --- - // SCC = (S0.i < signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LE_I32 class methods --- - - Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_I32 - - Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32() - { - } // ~Inst_SOPK__S_CMPK_LE_I32 - - // --- description from .arch file --- - // SCC = (S0.i <= signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_EQ_U32 class methods --- - - Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_U32 - - Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32() - { - } // ~Inst_SOPK__S_CMPK_EQ_U32 - - // --- description from .arch file --- - // SCC = (S0.u == SIMM16). - void - Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LG_U32 class methods --- - - Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_U32 - - Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32() - { - } // ~Inst_SOPK__S_CMPK_LG_U32 - - // --- description from .arch file --- - // SCC = (S0.u != SIMM16). - void - Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GT_U32 class methods --- - - Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_U32 - - Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32() - { - } // ~Inst_SOPK__S_CMPK_GT_U32 - - // --- description from .arch file --- - // SCC = (S0.u > SIMM16). - void - Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GE_U32 class methods --- - - Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_U32 - - Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32() - { - } // ~Inst_SOPK__S_CMPK_GE_U32 - - // --- description from .arch file --- - // SCC = (S0.u >= SIMM16). - void - Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LT_U32 class methods --- - - Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_U32 - - Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32() - { - } // ~Inst_SOPK__S_CMPK_LT_U32 - - // --- description from .arch file --- - // SCC = (S0.u < SIMM16). - void - Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LE_U32 class methods --- - - Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_U32 - - Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32() - { - } // ~Inst_SOPK__S_CMPK_LE_U32 - - // --- description from .arch file --- - // SCC = (S0.u <= SIMM16). - void - Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_ADDK_I32 class methods --- - - Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_addk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_ADDK_I32 - - Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32() - { - } // ~Inst_SOPK__S_ADDK_I32 - - // --- description from .arch file --- - // D.i = D.i + signext(SIMM16); - // SCC = overflow. - void - Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16); - scc = (bits(src.rawData(), 31) == bits(simm16, 15) - && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOPK__S_MULK_I32 class methods --- - - Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_mulk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MULK_I32 - - Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32() - { - } // ~Inst_SOPK__S_MULK_I32 - - // --- description from .arch file --- - // D.i = D.i * signext(SIMM16). 
- void - Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16); - - sdst.write(); - } // execute - // --- Inst_SOPK__S_CBRANCH_I_FORK class methods --- - - Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cbranch_i_fork") - { - setFlag(Branch); - } // Inst_SOPK__S_CBRANCH_I_FORK - - Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK() - { - } // ~Inst_SOPK__S_CBRANCH_I_FORK - - // --- description from .arch file --- - // mask_pass = S0.u64 & EXEC; - // mask_fail = ~S0.u64 & EXEC; - // target_addr = PC + signext(SIMM16 * 4) + 4; - // if(mask_pass == EXEC) - // PC = target_addr; - // elsif(mask_fail == EXEC) - // PC += 4; - // elsif(bitcount(mask_fail) < bitcount(mask_pass)) - // EXEC = mask_fail; - // SGPR[CSP*4] = { target_addr, mask_pass }; - // CSP++; - // PC += 4; - // else - // EXEC = mask_pass; - // SGPR[CSP*4] = { PC + 4, mask_fail }; - // CSP++; - // PC = target_addr; - // end. - // Conditional branch using branch-stack. - // S0 = compare mask(vcc or any sgpr), and - // SIMM16 = signed DWORD branch offset relative to next instruction. - // See also S_CBRANCH_JOIN. - void - Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPK__S_GETREG_B32 class methods --- - - Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_getreg_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_GETREG_B32 - - Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32() - { - } // ~Inst_SOPK__S_GETREG_B32 - - // --- description from .arch file --- - // D.u = hardware-reg. Read some or all of a hardware register into the - // LSBs of D. - // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - sdst.read(); - - // Store value from hardware to part of the SDST. - ScalarRegU32 mask = (((1U << size) - 1U) << offset); - sdst = (hwreg & mask) >> offset; - sdst.write(); - } // execute - // --- Inst_SOPK__S_SETREG_B32 class methods --- - - Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_setreg_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_SETREG_B32 - - Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() - { - } // ~Inst_SOPK__S_SETREG_B32 - - // --- description from .arch file --- - // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware - // register. - // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. 
- void - Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - sdst.read(); - - // Store value from SDST to part of the hardware register. - ScalarRegU32 mask = (((1U << size) - 1U) << offset); - hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask)); - gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); - - // set MODE register to control the behavior of single precision - // floating-point numbers: denormal mode or round mode - if (hwregId==1 && size==2 - && (offset==4 || offset==0)) { - warn_once("Be cautious that s_setreg_b32 has no real effect " - "on FP modes: %s\n", gpuDynInst->disassemble()); - return; - } - - // panic if not changing MODE of floating-point numbers - panicUnimplemented(); - } // execute - // --- Inst_SOPK__S_SETREG_IMM32_B32 class methods --- - - Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32( - InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_setreg_imm32_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_SETREG_IMM32_B32 - - Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32() - { - } // ~Inst_SOPK__S_SETREG_IMM32_B32 - - // --- description from .arch file --- - // Write some or all of the LSBs of IMM32 into a hardware register; this - // --- instruction requires a 32-bit literal constant. - // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarRegI32 simm32 = extData.imm_u32; - - // Store value from SIMM32 to part of the hardware register. - ScalarRegU32 mask = (((1U << size) - 1U) << offset); - hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask)); - gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); - - // set MODE register to control the behavior of single precision - // floating-point numbers: denormal mode or round mode - if (hwregId==HW_REG_MODE && size==2 - && (offset==4 || offset==0)) { - warn_once("Be cautious that s_setreg_imm32_b32 has no real effect " - "on FP modes: %s\n", gpuDynInst->disassemble()); - return; - } - - // panic if not changing modes of single-precision FPs - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_MOV_B32 class methods --- - - Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_B32 - - Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32() - { - } // ~Inst_SOP1__S_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. 
- void - Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOV_B64 class methods --- - - Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_B64 - - Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64() - { - } // ~Inst_SOP1__S_MOV_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64. - void - Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_CMOV_B32 class methods --- - - Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cmov_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_CMOV_B32 - - Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32() - { - } // ~Inst_SOP1__S_CMOV_B32 - - // --- description from .arch file --- - // (SCC) then D.u = S0.u; - // else NOP. - // Conditional move. - void - Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - scc.read(); - - if (scc.rawData()) { - sdst = src.rawData(); - sdst.write(); - } - } // execute - // --- Inst_SOP1__S_CMOV_B64 class methods --- - - Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cmov_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_CMOV_B64 - - Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64() - { - } // ~Inst_SOP1__S_CMOV_B64 - - // --- description from .arch file --- - // if(SCC) then D.u64 = S0.u64; - // else NOP. - // Conditional move. - void - Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - scc.read(); - - if (scc.rawData()) { - sdst = src.rawData(); - sdst.write(); - } - } // execute - // --- Inst_SOP1__S_NOT_B32 class methods --- - - Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_not_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_NOT_B32 - - Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32() - { - } // ~Inst_SOP1__S_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u; - // SCC = 1 if result is non-zero. - // Bitwise negation. - void - Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = ~src.rawData(); - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NOT_B64 class methods --- - - Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_not_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_NOT_B64 - - Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64() - { - } // ~Inst_SOP1__S_NOT_B64 - - // --- description from .arch file --- - // D.u64 = ~S0.u64; - // SCC = 1 if result is non-zero. - // Bitwise negation. 
- void - Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = ~src.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_WQM_B32 class methods --- - - Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_wqm_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_WQM_B32 - - Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32() - { - } // ~Inst_SOP1__S_WQM_B32 - - // --- description from .arch file --- - // D[i] = (S0[(i & ~3):(i | 3)] != 0); - // Computes whole quad mode for an active/valid mask. - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wholeQuadMode(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_WQM_B64 class methods --- - - Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_wqm_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_WQM_B64 - - Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64() - { - } // ~Inst_SOP1__S_WQM_B64 - - // --- description from .arch file --- - // D[i] = (S0[(i & ~3):(i | 3)] != 0); - // Computes whole quad mode for an active/valid mask. - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wholeQuadMode(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BREV_B32 class methods --- - - Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_brev_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BREV_B32 - - Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32() - { - } // ~Inst_SOP1__S_BREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31] (reverse bits). - void - Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = reverseBits(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BREV_B64 class methods --- - - Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_brev_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BREV_B64 - - Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64() - { - } // ~Inst_SOP1__S_BREV_B64 - - // --- description from .arch file --- - // D.u64[63:0] = S0.u64[0:63] (reverse bits). 
- void - Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = reverseBits(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BCNT0_I32_B32 class methods --- - - Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt0_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT0_I32_B32 - - Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32() - { - } // ~Inst_SOP1__S_BCNT0_I32_B32 - - // --- description from .arch file --- - // D.i = CountZeroBits(S0.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = countZeroBits(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT0_I32_B64 class methods --- - - Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt0_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT0_I32_B64 - - Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64() - { - } // ~Inst_SOP1__S_BCNT0_I32_B64 - - // --- description from .arch file --- - // D.i = CountZeroBits(S0.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = countZeroBits(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT1_I32_B32 class methods --- - - Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt1_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT1_I32_B32 - - Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32() - { - } // ~Inst_SOP1__S_BCNT1_I32_B32 - - // --- description from .arch file --- - // D.i = CountOneBits(S0.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = popCount(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT1_I32_B64 class methods --- - - Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt1_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT1_I32_B64 - - Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64() - { - } // ~Inst_SOP1__S_BCNT1_I32_B64 - - // --- description from .arch file --- - // D.i = CountOneBits(S0.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = popCount(src.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_FF0_I32_B32 class methods --- - - Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff0_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FF0_I32_B32 - - Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32() - { - } // ~Inst_SOP1__S_FF0_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstZero(S0.u); - // If no zeros are found, return -1. - // Returns the bit position of the first zero from the LSB. - void - Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstZero(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF0_I32_B64 class methods --- - - Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff0_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FF0_I32_B64 - - Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() - { - } // ~Inst_SOP1__S_FF0_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstZero(S0.u64); - // If no zeros are found, return -1. - // Returns the bit position of the first zero from the LSB. - void - Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstZero(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF1_I32_B32 class methods --- - - Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff1_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FF1_I32_B32 - - Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() - { - } // ~Inst_SOP1__S_FF1_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u); - // If no ones are found, return -1. - // Returns the bit position of the first one from the LSB. - void - Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstOne(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF1_I32_B64 class methods --- - - Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff1_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FF1_I32_B64 - - Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() - { - } // ~Inst_SOP1__S_FF1_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u64); - // If no ones are found, return -1. - // Returns the bit position of the first one from the LSB. - void - Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstOne(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_B32 class methods --- - - Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_B32 - - Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() - { - } // ~Inst_SOP1__S_FLBIT_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u); - // If no ones are found, return -1. 
- // Counts how many zeros before the first one starting from the MSB. - void - Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = countZeroBitsMsb(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_B64 class methods --- - - Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_B64 - - Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() - { - } // ~Inst_SOP1__S_FLBIT_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u64); - // If no ones are found, return -1. - // Counts how many zeros before the first one starting from the MSB. - void - Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = countZeroBitsMsb(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32 class methods --- - - Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32 - - Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32() - { - } // ~Inst_SOP1__S_FLBIT_I32 - - // --- description from .arch file --- - // D.i = FirstOppositeSignBit(S0.i); - // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. - // Counts how many bits in a row (from MSB to LSB) are the same as the - // sign bit. - void - Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_I64 class methods --- - - Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_i64") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_I64 - - Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() - { - } // ~Inst_SOP1__S_FLBIT_I32_I64 - - // --- description from .arch file --- - // D.i = FirstOppositeSignBit(S0.i64); - // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. - // Counts how many bits in a row (from MSB to LSB) are the same as the - // sign bit. - void - Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SEXT_I32_I8 class methods --- - - Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_sext_i32_i8") - { - setFlag(ALU); - } // Inst_SOP1__S_SEXT_I32_I8 - - Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() - { - } // ~Inst_SOP1__S_SEXT_I32_I8 - - // --- description from .arch file --- - // D.i = signext(S0.i[7:0]) (sign extension). 
-    void
-    Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst)
-    {
-        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
-        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
-
-        src.read();
-
-        sdst = sext<std::numeric_limits<ScalarRegI8>::digits>(
-            bits(src.rawData(), 7, 0));
-
-        sdst.write();
-    } // execute
-    // --- Inst_SOP1__S_SEXT_I32_I16 class methods ---
-
-    Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt)
-        : Inst_SOP1(iFmt, "s_sext_i32_i16")
-    {
-        setFlag(ALU);
-    } // Inst_SOP1__S_SEXT_I32_I16
-
-    Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16()
-    {
-    } // ~Inst_SOP1__S_SEXT_I32_I16
-
-    // --- description from .arch file ---
-    // D.i = signext(S0.i[15:0]) (sign extension).
-    void
-    Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst)
-    {
-        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
-        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
-
-        src.read();
-
-        sdst = sext<std::numeric_limits<ScalarRegI16>::digits>(
-            bits(src.rawData(), 15, 0));
-
-        sdst.write();
-    } // execute
-    // --- Inst_SOP1__S_BITSET0_B32 class methods ---
-
-    Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt)
-        : Inst_SOP1(iFmt, "s_bitset0_b32")
-    {
-        setFlag(ALU);
-    } // Inst_SOP1__S_BITSET0_B32
-
-    Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32()
-    {
-    } // ~Inst_SOP1__S_BITSET0_B32
-
-    // --- description from .arch file ---
-    // D.u[S0.u[4:0]] = 0.
-    void
-    Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst)
-    {
-        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
-        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
-
-        src.read();
-
-        sdst.setBit(bits(src.rawData(), 4, 0), 0);
-
-        sdst.write();
-    } // execute
-    // --- Inst_SOP1__S_BITSET0_B64 class methods ---
-
-    Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt)
-        : Inst_SOP1(iFmt, "s_bitset0_b64")
-    {
-        setFlag(ALU);
-    } // Inst_SOP1__S_BITSET0_B64
-
-    Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64()
-    {
-    } // ~Inst_SOP1__S_BITSET0_B64
-
-    // --- description from .arch file ---
-    // D.u64[S0.u[5:0]] = 0.
-    void
-    Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst)
-    {
-        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
-        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
-
-        src.read();
-
-        sdst.setBit(bits(src.rawData(), 5, 0), 0);
-
-        sdst.write();
-    } // execute
-    // --- Inst_SOP1__S_BITSET1_B32 class methods ---
-
-    Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt)
-        : Inst_SOP1(iFmt, "s_bitset1_b32")
-    {
-        setFlag(ALU);
-    } // Inst_SOP1__S_BITSET1_B32
-
-    Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32()
-    {
-    } // ~Inst_SOP1__S_BITSET1_B32
-
-    // --- description from .arch file ---
-    // D.u[S0.u[4:0]] = 1.
-    void
-    Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst)
-    {
-        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
-        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
-
-        src.read();
-
-        sdst.setBit(bits(src.rawData(), 4, 0), 1);
-
-        sdst.write();
-    } // execute
-    // --- Inst_SOP1__S_BITSET1_B64 class methods ---
-
-    Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt)
-        : Inst_SOP1(iFmt, "s_bitset1_b64")
-    {
-        setFlag(ALU);
-    } // Inst_SOP1__S_BITSET1_B64
-
-    Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64()
-    {
-    } // ~Inst_SOP1__S_BITSET1_B64
-
-    // --- description from .arch file ---
-    // D.u64[S0.u[5:0]] = 1.
- void - Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 5, 0), 1); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_GETPC_B64 class methods --- - - Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_getpc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_GETPC_B64 - - Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64() - { - } // ~Inst_SOP1__S_GETPC_B64 - - // --- description from .arch file --- - // D.u64 = PC + 4. - // Destination receives the byte address of the next instruction. - void - Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Addr pc = gpuDynInst->pc(); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - sdst = pc + 4; - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SETPC_B64 class methods --- - - Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_setpc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_SETPC_B64 - - Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() - { - } // ~Inst_SOP1__S_SETPC_B64 - - // --- description from .arch file --- - // PC = S0.u64. - // S0.u64 is a byte address of the instruction to jump to. - void - Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - - src.read(); - - wf->pc(src.rawData()); - } // execute - // --- Inst_SOP1__S_SWAPPC_B64 class methods --- - - Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_swappc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_SWAPPC_B64 - - Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() - { - } // ~Inst_SOP1__S_SWAPPC_B64 - - // --- description from .arch file --- - // D.u64 = PC + 4; PC = S0.u64. - // S0.u64 is a byte address of the instruction to jump to. - void - Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = pc + 4; - - wf->pc(src.rawData()); - sdst.write(); - } // execute - // --- Inst_SOP1__S_RFE_B64 class methods --- - - Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_rfe_b64") - { - } // Inst_SOP1__S_RFE_B64 - - Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64() - { - } // ~Inst_SOP1__S_RFE_B64 - - // --- description from .arch file --- - // PRIV = 0; - // PC = S0.u64. - // Return from exception handler and continue. - // This instruction may only be used within a trap handler. - void - Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_AND_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_and_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_AND_SAVEEXEC_B64 - - Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_AND_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 & EXEC; - // SCC = 1 if the new value of EXEC is non-zero. 
- void - Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() & wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_OR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_or_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_OR_SAVEEXEC_B64 - - Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_OR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 | EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() | wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_XOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_xor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_XOR_SAVEEXEC_B64 - - Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_XOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 ^ EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() ^ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_ANDN2_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_andn2_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64 - - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 & ~EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() &~ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_ORN2_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_orn2_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_ORN2_SAVEEXEC_B64 - - Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 | ~EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() |~ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NAND_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_nand_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_NAND_SAVEEXEC_B64 - - Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_NAND_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 & EXEC); - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_nor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_NOR_SAVEEXEC_B64 - - Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_NOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 | EXEC); - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_XNOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_xnor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_XNOR_SAVEEXEC_B64 - - Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 ^ EXEC); - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_QUADMASK_B32 class methods --- - - Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_quadmask_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_QUADMASK_B32 - - Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32() - { - } // ~Inst_SOP1__S_QUADMASK_B32 - - // --- description from .arch file --- - // D.u = QuadMask(S0.u): - // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0; - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = quadMask(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_QUADMASK_B64 class methods --- - - Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_quadmask_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_QUADMASK_B64 - - Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64() - { - } // ~Inst_SOP1__S_QUADMASK_B64 - - // --- description from .arch file --- - // D.u64 = QuadMask(S0.u64): - // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0; - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = quadMask(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_MOVRELS_B32 class methods --- - - Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movrels_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELS_B32 - - Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32() - { - } // ~Inst_SOP1__S_MOVRELS_B32 - - // --- description from .arch file --- - // D.u = SGPR[S0.u + M0.u].u (move from relative source). 
- void - Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELS_B64 class methods --- - - Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movrels_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELS_B64 - - Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() - { - } // ~Inst_SOP1__S_MOVRELS_B64 - - // --- description from .arch file --- - // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). - // The index in M0.u must be even for this operation. - void - Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELD_B32 class methods --- - - Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movreld_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELD_B32 - - Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() - { - } // ~Inst_SOP1__S_MOVRELD_B32 - - // --- description from .arch file --- - // SGPR[D.u + M0.u].u = S0.u (move to relative destination). - void - Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData()); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELD_B64 class methods --- - - Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movreld_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELD_B64 - - Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() - { - } // ~Inst_SOP1__S_MOVRELD_B64 - - // --- description from .arch file --- - // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination). - // The index in M0.u must be even for this operation. - void - Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData()); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_CBRANCH_JOIN class methods --- - - Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cbranch_join") - { - setFlag(Branch); - setFlag(WritesEXEC); - } // Inst_SOP1__S_CBRANCH_JOIN - - Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() - { - } // ~Inst_SOP1__S_CBRANCH_JOIN - - // --- description from .arch file --- - // saved_csp = S0.u; - // if(CSP == saved_csp) then - // PC += 4; // Second time to JOIN: continue with program. - // else - // CSP -= 1; // First time to JOIN; jump to other FORK path. - // {PC, EXEC} = SGPR[CSP * 4]; // Read 128 bits from 4 consecutive - // SGPRs. - // end - // Conditional branch join point (end of conditional branch block). S0 is - // saved CSP value. - // See S_CBRANCH_G_FORK and S_CBRANCH_I_FORK for related instructions. 
- void - Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_ABS_I32 class methods --- - - Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_abs_i32") - { - setFlag(ALU); - } // Inst_SOP1__S_ABS_I32 - - Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() - { - } // ~Inst_SOP1__S_ABS_I32 - - // --- description from .arch file --- - // if(S.i < 0) then D.i = -S.i; - // else D.i = S.i; - // SCC = 1 if result is non-zero. - // Integer absolute value. - void - Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = std::abs(src.rawData()); - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_MOV_FED_B32 class methods --- - - Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_fed_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_FED_B32 - - Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() - { - } // ~Inst_SOP1__S_MOV_FED_B32 - - // --- description from .arch file --- - // D.u = S0.u. Introduce an EDC double-detect error on write to the - // destination SGPR. - void - Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_SET_GPR_IDX_IDX class methods --- - - Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") - { - } // Inst_SOP1__S_SET_GPR_IDX_IDX - - Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() - { - } // ~Inst_SOP1__S_SET_GPR_IDX_IDX - - // --- description from .arch file --- - // M0[7:0] = S0.u[7:0]. - // Modify the index used in vector GPR indexing. - void - Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_I32 class methods --- - - Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_I32 - - Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() - { - } // ~Inst_SOPC__S_CMP_EQ_I32 - - // --- description from .arch file --- - // SCC = (S0.i == S1.i). - void - Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_I32 class methods --- - - Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_I32 - - Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() - { - } // ~Inst_SOPC__S_CMP_LG_I32 - - // --- description from .arch file --- - // SCC = (S0.i != S1.i). - void - Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GT_I32 class methods --- - - Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_I32 - - Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() - { - } // ~Inst_SOPC__S_CMP_GT_I32 - - // --- description from .arch file --- - // SCC = (S0.i > S1.i). - void - Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GE_I32 class methods --- - - Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_I32 - - Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() - { - } // ~Inst_SOPC__S_CMP_GE_I32 - - // --- description from .arch file --- - // SCC = (S0.i >= S1.i). - void - Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LT_I32 class methods --- - - Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_I32 - - Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() - { - } // ~Inst_SOPC__S_CMP_LT_I32 - - // --- description from .arch file --- - // SCC = (S0.i < S1.i). - void - Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LE_I32 class methods --- - - Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_I32 - - Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() - { - } // ~Inst_SOPC__S_CMP_LE_I32 - - // --- description from .arch file --- - // SCC = (S0.i <= S1.i). - void - Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_U32 class methods --- - - Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_U32 - - Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() - { - } // ~Inst_SOPC__S_CMP_EQ_U32 - - // --- description from .arch file --- - // SCC = (S0.u == S1.u). 
- void - Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_U32 class methods --- - - Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_U32 - - Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() - { - } // ~Inst_SOPC__S_CMP_LG_U32 - - // --- description from .arch file --- - // SCC = (S0.u != S1.u). - void - Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GT_U32 class methods --- - - Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_U32 - - Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() - { - } // ~Inst_SOPC__S_CMP_GT_U32 - - // --- description from .arch file --- - // SCC = (S0.u > S1.u). - void - Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GE_U32 class methods --- - - Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_U32 - - Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() - { - } // ~Inst_SOPC__S_CMP_GE_U32 - - // --- description from .arch file --- - // SCC = (S0.u >= S1.u). - void - Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LT_U32 class methods --- - - Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_U32 - - Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() - { - } // ~Inst_SOPC__S_CMP_LT_U32 - - // --- description from .arch file --- - // SCC = (S0.u < S1.u). - void - Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LE_U32 class methods --- - - Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_U32 - - Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() - { - } // ~Inst_SOPC__S_CMP_LE_U32 - - // --- description from .arch file --- - // SCC = (S0.u <= S1.u). - void - Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP0_B32 class methods --- - - Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp0_b32") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP0_B32 - - Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() - { - } // ~Inst_SOPC__S_BITCMP0_B32 - - // --- description from .arch file --- - // SCC = (S0.u[S1.u[4:0]] == 0). - void - Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP1_B32 class methods --- - - Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp1_b32") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP1_B32 - - Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32() - { - } // ~Inst_SOPC__S_BITCMP1_B32 - - // --- description from .arch file --- - // SCC = (S0.u[S1.u[4:0]] == 1). - void - Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP0_B64 class methods --- - - Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp0_b64") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP0_B64 - - Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64() - { - } // ~Inst_SOPC__S_BITCMP0_B64 - - // --- description from .arch file --- - // SCC = (S0.u64[S1.u[5:0]] == 0). - void - Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP1_B64 class methods --- - - Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp1_b64") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP1_B64 - - Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64() - { - } // ~Inst_SOPC__S_BITCMP1_B64 - - // --- description from .arch file --- - // SCC = (S0.u64[S1.u[5:0]] == 1). 
- void - Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_SETVSKIP class methods --- - - Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_setvskip") - { - } // Inst_SOPC__S_SETVSKIP - - Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP() - { - } // ~Inst_SOPC__S_SETVSKIP - - // --- description from .arch file --- - // VSKIP = S0.u[S1.u[4:0]]. - // Enables and disables VSKIP mode. - // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instuctions are - // issued. - // If any vector operations are outstanding, S_WAITCNT must be issued - // before executing. - // This instruction requires one waitstate after executing (e.g. S_NOP 0). - // Example: - // s_waitcnt 0 - // s_setvskip 1, 0 // Enable vskip mode. - // s_nop 1 - void - Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods --- - - Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_set_gpr_idx_on") - { - } // Inst_SOPC__S_SET_GPR_IDX_ON - - Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON() - { - } // ~Inst_SOPC__S_SET_GPR_IDX_ON - - // --- description from .arch file --- - // MODE.gpr_idx_en = 1; - // M0[7:0] = S0.u[7:0]; - // M0[15:12] = SIMM4 (direct contents of S1 field); - // // Remaining bits of M0 are unmodified. - // Enable GPR indexing mode. Vector operations after this will perform - // relative GPR addressing based on the contents of M0. The structure - // SQ_M0_GPR_IDX_WORD may be used to decode M0. - // The raw contents of the S1 field are read and used to set the enable - // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and - // S1[3] = VDST_REL. - void - Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_U64 class methods --- - - Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_u64") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_U64 - - Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64() - { - } // ~Inst_SOPC__S_CMP_EQ_U64 - - // --- description from .arch file --- - // SCC = (S0.i64 == S1.i64). - void - Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_U64 class methods --- - - Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_u64") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_U64 - - Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64() - { - } // ~Inst_SOPC__S_CMP_LG_U64 - - // --- description from .arch file --- - // SCC = (S0.i64 != S1.i64). 
- void - Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPP__S_NOP class methods --- - - Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_nop") - { - setFlag(Nop); - } // Inst_SOPP__S_NOP - - Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP() - { - } // ~Inst_SOPP__S_NOP - - // --- description from .arch file --- - // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time, - // 7 = 8 times. - // This instruction may be used to introduce wait states to resolve - // hazards; see the shader programming guide for details. Compare with - // S_SLEEP. - void - Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_SOPP__S_ENDPGM class methods --- - - Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_endpgm") - { - setFlag(EndOfKernel); - } // Inst_SOPP__S_ENDPGM - - Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM() - { - } // ~Inst_SOPP__S_ENDPGM - - // --- description from .arch file --- - // End of program; terminate wavefront. - // The hardware implicitly executes S_WAITCNT 0 before executing this - // --- instruction. - // See S_ENDPGM_SAVED for the context-switch version of this instruction. - void - Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ComputeUnit *cu = gpuDynInst->computeUnit(); - - // delete extra instructions fetched for completed work-items - wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1, - wf->instructionBuffer.end()); - - if (wf->pendingFetch) { - wf->dropFetch = true; - } - - wf->computeUnit->fetchStage.fetchUnit(wf->simdId) - .flushBuf(wf->wfSlotId); - wf->setStatus(Wavefront::S_STOPPED); - - int refCount = wf->computeUnit->getLds() - .decreaseRefCounter(wf->dispatchId, wf->wgId); - - /** - * The parent WF of this instruction is exiting, therefore - * it should not participate in this barrier any longer. This - * prevents possible deadlock issues if WFs exit early. - */ - int bar_id = WFBarrier::InvalidID; - if (wf->hasBarrier()) { - assert(wf->getStatus() != Wavefront::S_BARRIER); - bar_id = wf->barrierId(); - assert(bar_id != WFBarrier::InvalidID); - wf->releaseBarrier(); - cu->decMaxBarrierCnt(bar_id); - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the " - "program and decrementing max barrier count for " - "barrier Id%d. 
New max count: %d.\n", cu->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, - cu->maxBarrierCnt(bar_id)); - } - - DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", - wf->computeUnit->cu_id, wf->wgId, refCount); - - wf->computeUnit->registerManager->freeRegisters(wf); - wf->computeUnit->stats.completedWfs++; - wf->computeUnit->activeWaves--; - - panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less " - "than zero\n", wf->computeUnit->cu_id); - - DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", - wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId); - - for (int i = 0; i < wf->vecReads.size(); i++) { - if (wf->rawDist.find(i) != wf->rawDist.end()) { - wf->stats.readsPerWrite.sample(wf->vecReads.at(i)); - } - } - wf->vecReads.clear(); - wf->rawDist.clear(); - wf->lastInstExec = 0; - - if (!refCount) { - /** - * If all WFs have finished, and hence the WG has finished, - * then we can free up the barrier belonging to the parent - * WG, but only if we actually used a barrier (i.e., more - * than one WF in the WG). - */ - if (bar_id != WFBarrier::InvalidID) { - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " - "now complete. Releasing barrier Id%d.\n", cu->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId, - wf->barrierId()); - cu->releaseBarrier(bar_id); - } - - /** - * Last wavefront of the workgroup has executed return. If the - * workgroup is not the final one in the kernel, then simply - * retire it; however, if it is the final one, i.e., indicating - * the kernel end, then release operation (i.e., GL2 WB) is - * needed - */ - - //check whether the workgroup is indicating the kernel end, i.e., - //the last workgroup in the kernel - bool kernelEnd = - wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf); - - bool relNeeded = - wf->computeUnit->shader->impl_kern_end_rel; - - //if it is not a kernel end, then retire the workgroup directly - if (!kernelEnd || !relNeeded) { - wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); - wf->setStatus(Wavefront::S_STOPPED); - wf->computeUnit->stats.completedWGs++; - - return; - } - - /** - * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and - * retire the workgroup after receving response. - * note that GL0V and GL1 are read only, and they just forward GL2 - * WB request. When forwarding, GL1 send the request to all GL2 in - * the complex - */ - setFlag(MemSync); - setFlag(GlobalSegment); - // Notify Memory System of Kernel Completion - // Kernel End = isKernel + isMemSync - wf->setStatus(Wavefront::S_RETURNING); - gpuDynInst->simdId = wf->simdId; - gpuDynInst->wfSlotId = wf->wfSlotId; - gpuDynInst->wfDynId = wf->wfDynId; - - DPRINTF(GPUExec, "inject global memory fence for CU%d: " - "WF[%d][%d][%d]\n", wf->computeUnit->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId); - - // call shader to prepare the flush operations - wf->computeUnit->shader->prepareFlush(gpuDynInst); - - wf->computeUnit->stats.completedWGs++; - } else { - wf->computeUnit->shader->dispatcher().scheduleDispatch(); - } - } // execute - - // --- Inst_SOPP__S_BRANCH class methods --- - - Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_branch") - { - setFlag(Branch); - } // Inst_SOPP__S_BRANCH - - Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH() - { - } // ~Inst_SOPP__S_BRANCH - - // --- description from .arch file --- - // PC = PC + signext(SIMM16 * 4) + 4 (short jump). - // For a long jump, use S_SETPC. 
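A quick sketch of the branch-target arithmetic described above and used by s_branch and the s_cbranch_* variants that follow: SIMM16 is a signed dword offset taken relative to the instruction after the branch, hence the extra 4 bytes. This is a standalone illustration, not patch content:

    #include <cstdint>

    // PC = PC + signext(SIMM16 * 4) + 4 (short jump).
    uint64_t branchTarget(uint64_t pc, int16_t simm16)
    {
        return pc + static_cast<int64_t>(simm16) * 4 + 4;
    }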
- void - Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_WAKEUP class methods --- - - Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_wakeup") - { - } // Inst_SOPP__S_WAKEUP - - Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP() - { - } // ~Inst_SOPP__S_WAKEUP - - // --- description from .arch file --- - // Allow a wave to 'ping' all the other waves in its threadgroup to force - // them to wake up immediately from an S_SLEEP instruction. The ping is - // ignored if the waves are not sleeping. - // This allows for more efficient polling on a memory location. The waves - // which are polling can sit in a long S_SLEEP between memory reads, but - // the wave which writes the value can tell them all to wake up early now - // that the data is available. This is useful for fBarrier implementations - // (speedup). - // This method is also safe from races because if any wave misses the ping, - // everything still works fine (whoever missed it just completes their - // normal S_SLEEP). - void - Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_SCC0 class methods --- - - Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_scc0") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_SCC0 - - Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0() - { - } // ~Inst_SOPP__S_CBRANCH_SCC0 - - // --- description from .arch file --- - // if(SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (!scc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_SCC1 class methods --- - - Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_scc1") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_SCC1 - - Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1() - { - } // ~Inst_SOPP__S_CBRANCH_SCC1 - - // --- description from .arch file --- - // if(SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_VCCZ class methods --- - - Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_vccz") - { - setFlag(Branch); - setFlag(ReadsVCC); - } // Inst_SOPP__S_CBRANCH_VCCZ - - Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ() - { - } // ~Inst_SOPP__S_CBRANCH_VCCZ - - // --- description from .arch file --- - // if(VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - - vcc.read(); - - if (!vcc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods --- - - Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_vccnz") - { - setFlag(Branch); - setFlag(ReadsVCC); - } // Inst_SOPP__S_CBRANCH_VCCNZ - - Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ() - { - } // ~Inst_SOPP__S_CBRANCH_VCCNZ - - // --- description from .arch file --- - // if(VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - vcc.read(); - - if (vcc.rawData()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_CBRANCH_EXECZ class methods --- - - Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_execz") - { - setFlag(Branch); - setFlag(ReadsEXEC); - } // Inst_SOPP__S_CBRANCH_EXECZ - - Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ() - { - } // ~Inst_SOPP__S_CBRANCH_EXECZ - - // --- description from .arch file --- - // if(EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().none()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_CBRANCH_EXECNZ class methods --- - - Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_execnz") - { - setFlag(Branch); - setFlag(ReadsEXEC); - } // Inst_SOPP__S_CBRANCH_EXECNZ - - Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ() - { - } // ~Inst_SOPP__S_CBRANCH_EXECNZ - - // --- description from .arch file --- - // if(EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().any()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_BARRIER class methods --- - - Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_barrier") - { - setFlag(MemBarrier); - } // Inst_SOPP__S_BARRIER - - Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER() - { - } // ~Inst_SOPP__S_BARRIER - - // --- description from .arch file --- - // Synchronize waves within a threadgroup. - // If not all waves of the threadgroup have been created yet, waits for - // entire group before proceeding. - // If some waves in the threadgroup have already terminated, this waits on - // only the surviving waves. - // Barriers are legal inside trap handlers. 
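The conditional-branch hunks above differ only in the predicate they test before applying the branch arithmetic shown earlier. As a compact summary (a sketch, not patch content), the taken/not-taken decision reduces to:

    #include <cstdint>

    // SCC0/SCC1 test the 1-bit scalar condition code; VCCZ/VCCNZ and
    // EXECZ/EXECNZ test whether the 64-bit VCC / EXEC masks are all zero.
    bool takenScc0(uint32_t scc)    { return scc == 0; }
    bool takenScc1(uint32_t scc)    { return scc != 0; }
    bool takenVccz(uint64_t vcc)    { return vcc == 0; }
    bool takenVccnz(uint64_t vcc)   { return vcc != 0; }
    bool takenExecz(uint64_t exec)  { return exec == 0; }
    bool takenExecnz(uint64_t exec) { return exec != 0; }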
- void - Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ComputeUnit *cu = gpuDynInst->computeUnit(); - - if (wf->hasBarrier()) { - int bar_id = wf->barrierId(); - assert(wf->getStatus() == Wavefront::S_BARRIER); - cu->incNumAtBarrier(bar_id); - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at " - "barrier Id%d. %d waves now at barrier, %d waves " - "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId, - wf->wfDynId, bar_id, cu->numAtBarrier(bar_id), - cu->numYetToReachBarrier(bar_id)); - } - } // execute - // --- Inst_SOPP__S_SETKILL class methods --- - - Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_setkill") - { - } // Inst_SOPP__S_SETKILL - - Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL() - { - } // ~Inst_SOPP__S_SETKILL - - // --- description from .arch file --- - // set KILL bit to value of SIMM16[0]. - // Used primarily for debugging kill wave host command behavior. - void - Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_WAITCNT class methods --- - - Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_waitcnt") - { - setFlag(ALU); - setFlag(Waitcnt); - } // Inst_SOPP__S_WAITCNT - - Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT() - { - } // ~Inst_SOPP__S_WAITCNT - - // --- description from .arch file --- - // Wait for the counts of outstanding lds, vector-memory and - // --- export/vmem-write-data to be at or below the specified levels. - // SIMM16[3:0] = vmcount (vector memory operations), - // SIMM16[6:4] = export/mem-write-data count, - // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count). - void - Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 vm_cnt = 0; - ScalarRegI32 exp_cnt = 0; - ScalarRegI32 lgkm_cnt = 0; - vm_cnt = bits(instData.SIMM16, 3, 0); - exp_cnt = bits(instData.SIMM16, 6, 4); - lgkm_cnt = bits(instData.SIMM16, 12, 8); - gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT); - gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt); - } // execute - // --- Inst_SOPP__S_SETHALT class methods --- - - Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sethalt") - { - } // Inst_SOPP__S_SETHALT - - Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT() - { - } // ~Inst_SOPP__S_SETHALT - - // --- description from .arch file --- - // Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume. - // The halt flag is ignored while PRIV == 1 (inside trap handlers) but the - // shader will halt immediately after the handler returns if HALT is still - // set at that time. - void - Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SLEEP class methods --- - - Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sleep") - { - setFlag(ALU); - setFlag(Sleep); - } // Inst_SOPP__S_SLEEP - - Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP() - { - } // ~Inst_SOPP__S_SLEEP - - // --- description from .arch file --- - // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks. - // The exact amount of delay is approximate. Compare with S_NOP. 
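For reference, the s_waitcnt hunk above unpacks three independent counters from SIMM16 before stalling the wavefront. A self-contained sketch of that decoding, with the field layout taken from the comment and code above:

    #include <cstdint>

    struct WaitCnts { uint32_t vmCnt, expCnt, lgkmCnt; };

    // SIMM16[3:0] = vector-memory count, SIMM16[6:4] = export/mem-write-data
    // count, SIMM16[12:8] = LGKM (scalar-mem/GDS/LDS) count.
    WaitCnts decodeWaitcnt(uint16_t simm16)
    {
        return { simm16 & 0xfu,
                 (simm16 >> 4) & 0x7u,
                 (simm16 >> 8) & 0x1fu };
    }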
- void - Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16; - gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP); - // sleep duration is specified in multiples of 64 cycles - gpuDynInst->wavefront()->setSleepTime(64 * simm16); - } // execute - // --- Inst_SOPP__S_SETPRIO class methods --- - - Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_setprio") - { - setFlag(ALU); - } // Inst_SOPP__S_SETPRIO - - Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO() - { - } // ~Inst_SOPP__S_SETPRIO - - // --- description from .arch file --- - // User settable wave priority is set to SIMM16[1:0]. 0 = lowest, - // 3 = highest. - // The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0], - // WaveAge[3:0]}. - void - Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU16 simm16 = instData.SIMM16; - ScalarRegU32 userPrio = simm16 & 0x3; - - warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio); - } // execute - // --- Inst_SOPP__S_SENDMSG class methods --- - - Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sendmsg") - { - } // Inst_SOPP__S_SENDMSG - - Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG() - { - } // ~Inst_SOPP__S_SENDMSG - - // --- description from .arch file --- - // Send a message upstream to VGT or the interrupt handler. - // SIMM16[9:0] contains the message type and is documented in the shader - // --- programming guide. - void - Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SENDMSGHALT class methods --- - - Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sendmsghalt") - { - } // Inst_SOPP__S_SENDMSGHALT - - Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT() - { - } // ~Inst_SOPP__S_SENDMSGHALT - - // --- description from .arch file --- - // Send a message and then HALT the wavefront; see S_SENDMSG for details. - void - Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_TRAP class methods --- - - Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_trap") - { - } // Inst_SOPP__S_TRAP - - Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP() - { - } // ~Inst_SOPP__S_TRAP - - // --- description from .arch file --- - // TrapID = SIMM16[7:0]; - // Wait for all instructions to complete; - // set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0], - // PC[47:0]}; - // PC = TBA (trap base address); - // PRIV = 1. - // Enter the trap handler. This instruction may be generated internally as - // well in response to a host trap (HT = 1) or an exception. - // TrapID 0 is reserved for hardware use and should not be used in a - // shader-generated trap. - void - Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_ICACHE_INV class methods --- - - Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_icache_inv") - { - } // Inst_SOPP__S_ICACHE_INV - - Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() - { - } // ~Inst_SOPP__S_ICACHE_INV - - // --- description from .arch file --- - // Invalidate entire L1 instruction cache. - // You must have 12 separate S_NOP instructions or a jump/branch - // instruction after this instruction - // to ensure the SQ instruction buffer is purged. 
- void - Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_INCPERFLEVEL class methods --- - - Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_incperflevel") - { - } // Inst_SOPP__S_INCPERFLEVEL - - Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL() - { - } // ~Inst_SOPP__S_INCPERFLEVEL - - // --- description from .arch file --- - // Increment performance counter specified in SIMM16[3:0] by 1. - void - Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_DECPERFLEVEL class methods --- - - Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_decperflevel") - { - } // Inst_SOPP__S_DECPERFLEVEL - - Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL() - { - } // ~Inst_SOPP__S_DECPERFLEVEL - - // --- description from .arch file --- - // Decrement performance counter specified in SIMM16[3:0] by 1. - void - Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_TTRACEDATA class methods --- - - Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_ttracedata") - { - } // Inst_SOPP__S_TTRACEDATA - - Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA() - { - } // ~Inst_SOPP__S_TTRACEDATA - - // --- description from .arch file --- - // Send M0 as user data to the thread trace stream. - void - Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods --- - - Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS - - Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS - - // --- description from .arch file --- - // if(conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4) - // + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods --- - - Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbguser") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGUSER - - Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGUSER - - // --- description from .arch file --- - // if(conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods --- - - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER - - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER:: - ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER - - // --- description from .arch file --- - // if(conditional_debug_system || conditional_debug_user) then PC = PC + - // --- signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods --- - - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER - - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: - ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER - - // --- description from .arch file --- - // if(conditional_debug_system && conditional_debug_user) then PC = PC + - // --- signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_ENDPGM_SAVED class methods --- - - Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_endpgm_saved") - { - } // Inst_SOPP__S_ENDPGM_SAVED - - Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED() - { - } // ~Inst_SOPP__S_ENDPGM_SAVED - - // --- description from .arch file --- - // End of program; signal that a wave has been saved by the context-switch - // trap handler and terminate wavefront. - // The hardware implicitly executes S_WAITCNT 0 before executing this - // instruction. - // Use S_ENDPGM in all cases unless you are executing the context-switch - // save handler. - void - Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods --- - - Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_set_gpr_idx_off") - { - } // Inst_SOPP__S_SET_GPR_IDX_OFF - - Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF() - { - } // ~Inst_SOPP__S_SET_GPR_IDX_OFF - - // --- description from .arch file --- - // MODE.gpr_idx_en = 0. - // Clear GPR indexing mode. Vector operations after this will not perform - // --- relative GPR addressing regardless of the contents of M0. This - // --- instruction does not modify M0. - void - Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods --- - - Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_set_gpr_idx_mode") - { - } // Inst_SOPP__S_SET_GPR_IDX_MODE - - Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE() - { - } // ~Inst_SOPP__S_SET_GPR_IDX_MODE - - // --- description from .arch file --- - // M0[15:12] = SIMM4. - // Modify the mode used for vector GPR indexing. - // The raw contents of the source field are read and used to set the enable - // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL - // and SIMM4[3] = VDST_REL. - void - Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_LOAD_DWORD class methods --- - - Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORD - - Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD() - { - } // ~Inst_SMEM__S_LOAD_DWORD - - /** - * Read 1 dword from scalar data cache. If the offset is specified as an - * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are - * ignored). 
If the offset is specified as an immediate 20-bit constant, - * the constant is an unsigned byte offset. - */ - void - Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX2 class methods --- - - Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX2 - - Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() - { - } // ~Inst_SMEM__S_LOAD_DWORDX2 - - /** - * Read 2 dwords from scalar data cache. See s_load_dword for details on - * the offset input. - */ - void - Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX4 class methods --- - - Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX4 - - Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4() - { - } // ~Inst_SMEM__S_LOAD_DWORDX4 - - // --- description from .arch file --- - // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX8 class methods --- - - Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX8 - - Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8() - { - } // ~Inst_SMEM__S_LOAD_DWORDX8 - - // --- description from .arch file --- - // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. - void - Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<8>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX16 class methods --- - - Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX16 - - Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16() - { - } // ~Inst_SMEM__S_LOAD_DWORDX16 - - // --- description from .arch file --- - // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
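All of the s_load_dword* and s_buffer_load_dword* hunks share the same offset selection: an immediate unsigned byte offset when IMM is set, otherwise the value of the SGPR named by the OFFSET field. A minimal sketch of that selection, assuming the effective address is simply base plus byte offset (the actual address math lives in calcAddr(), and the names below are illustrative):

    #include <cstdint>

    // offsetField is the raw OFFSET encoding; offsetSgprVal is the value of
    // the SGPR it names when IMM == 0. Both offsets are unsigned byte counts.
    uint64_t smemEffectiveAddr(uint64_t base, bool imm,
                               uint32_t offsetField, uint32_t offsetSgprVal)
    {
        uint32_t offset = imm ? offsetField : offsetSgprVal;
        return base + offset;
    }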
- void - Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<16>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORD class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORD - - Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD - - // --- description from .arch file --- - // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the - // --- offset input. - void - Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 1 request, size 32 - ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX2 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2 - - // --- description from .arch file --- - // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - // use U64 because 2 requests, each size 32 - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX4 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4 - - // --- description from .arch file --- - // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 4 requests, each size 32 - ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX8 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8 - - // --- description from .arch file --- - // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<8>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 8 requests, each size 32 - ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX16 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16 - - // --- description from .arch file --- - // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<16>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 16 requests, each size 32 - ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_STORE_DWORD class methods --- - - Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORD - - Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD() - { - } // ~Inst_SMEM__S_STORE_DWORD - - // --- description from .arch file --- - // Write 1 dword to scalar data cache. - // If the offset is specified as an SGPR, the SGPR contains an unsigned - // BYTE offset (the 2 LSBs are ignored). - // If the offset is specified as an immediate 20-bit constant, the - // constant is an unsigned BYTE offset. 
- void - Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(ScalarRegU32)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_STORE_DWORDX2 class methods --- - - Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORDX2 - - Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2() - { - } // ~Inst_SMEM__S_STORE_DWORDX2 - - // --- description from .arch file --- - // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. - void - Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(ScalarRegU64)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_STORE_DWORDX4 class methods --- - - Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORDX4 - - Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4() - { - } // ~Inst_SMEM__S_STORE_DWORDX4 - - // --- description from .arch file --- - // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(gpuDynInst->scalar_data)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_BUFFER_STORE_DWORD class methods --- - - Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORD - - Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORD - - // --- description from .arch file --- - // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the - // --- offset input. - void - Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_BUFFER_STORE_DWORDX2 class methods --- - - Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORDX2 - - Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2 - - // --- description from .arch file --- - // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_BUFFER_STORE_DWORDX4 class methods --- - - Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORDX4 - - Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4 - - // --- description from .arch file --- - // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_DCACHE_INV class methods --- - - Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_inv") - { - } // Inst_SMEM__S_DCACHE_INV - - Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV() - { - } // ~Inst_SMEM__S_DCACHE_INV - - // --- description from .arch file --- - // Invalidate the scalar data cache. - void - Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_DCACHE_WB class methods --- - - Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_wb") - { - } // Inst_SMEM__S_DCACHE_WB - - Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB() - { - } // ~Inst_SMEM__S_DCACHE_WB - - // --- description from .arch file --- - // Write back dirty data in the scalar data cache. - void - Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_DCACHE_INV_VOL class methods --- - - Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_inv_vol") - { - } // Inst_SMEM__S_DCACHE_INV_VOL - - Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL() - { - } // ~Inst_SMEM__S_DCACHE_INV_VOL - - // --- description from .arch file --- - // Invalidate the scalar data cache volatile lines. - void - Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_DCACHE_WB_VOL class methods --- - - Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_wb_vol") - { - } // Inst_SMEM__S_DCACHE_WB_VOL - - Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL() - { - } // ~Inst_SMEM__S_DCACHE_WB_VOL - - // --- description from .arch file --- - // Write back dirty data in the scalar data cache volatile lines. - void - Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_MEMTIME class methods --- - - Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_memtime") - { - // s_memtime does not issue a memory request - setFlag(ALU); - } // Inst_SMEM__S_MEMTIME - - Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME() - { - } // ~Inst_SMEM__S_MEMTIME - - // --- description from .arch file --- - // Return current 64-bit timestamp. - void - Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst = (ScalarRegU64)gpuDynInst->computeUnit()->curCycle(); - sdst.write(); - } // execute - // --- Inst_SMEM__S_MEMREALTIME class methods --- - - Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_memrealtime") - { - } // Inst_SMEM__S_MEMREALTIME - - Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME() - { - } // ~Inst_SMEM__S_MEMREALTIME - - // --- description from .arch file --- - // Return current 64-bit RTC. 
-    void
-    Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst)
-    {
-        panicUnimplemented();
-    } // execute
-    // --- Inst_SMEM__S_ATC_PROBE class methods ---
-
-    Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt)
-        : Inst_SMEM(iFmt, "s_atc_probe")
-    {
-    } // Inst_SMEM__S_ATC_PROBE
-
-    Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE()
-    {
-    } // ~Inst_SMEM__S_ATC_PROBE
-
-    // --- description from .arch file ---
-    // Probe or prefetch an address into the SQC data cache.
-    void
-    Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst)
-    {
-        panicUnimplemented();
-    } // execute
-    // --- Inst_SMEM__S_ATC_PROBE_BUFFER class methods ---
-
-    Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER(
-        InFmt_SMEM *iFmt)
-        : Inst_SMEM(iFmt, "s_atc_probe_buffer")
-    {
-    } // Inst_SMEM__S_ATC_PROBE_BUFFER
-
-    Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER()
-    {
-    } // ~Inst_SMEM__S_ATC_PROBE_BUFFER
-
-    // --- description from .arch file ---
-    // Probe or prefetch an address into the SQC data cache.
-    void
-    Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst)
-    {
-        panicUnimplemented();
-    } // execute
-    // --- Inst_VOP2__V_CNDMASK_B32 class methods ---
-
-    Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_cndmask_b32")
-    {
-        setFlag(ALU);
-        setFlag(ReadsVCC);
-    } // Inst_VOP2__V_CNDMASK_B32
-
-    Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32()
-    {
-    } // ~Inst_VOP2__V_CNDMASK_B32
-
-    // --- description from .arch file ---
-    // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
-    // as a scalar GPR in S2.
-    void
-    Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
-        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
-        VecOperandU32 vdst(gpuDynInst, instData.VDST);
-        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
-
-        src0.readSrc();
-        src1.read();
-        vcc.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane]
-                    = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane];
-            }
-        }
-
-        vdst.write();
-    } // execute
-    // --- Inst_VOP2__V_ADD_F32 class methods ---
-
-    Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_add_f32")
-    {
-        setFlag(ALU);
-        setFlag(F32);
-    } // Inst_VOP2__V_ADD_F32
-
-    Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32()
-    {
-    } // ~Inst_VOP2__V_ADD_F32
-
-    // --- description from .arch file ---
-    // D.f = S0.f + S1.f.
-    void
-    Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
-        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
-        VecOperandF32 vdst(gpuDynInst, instData.VDST);
-
-        src0.readSrc();
-        src1.read();
-
-        if (isDPPInst()) {
-            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
-            src0_dpp.read();
-
-            DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
-                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
-                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
-                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
-                    extData.iFmt_VOP_DPP.DPP_CTRL,
-                    extData.iFmt_VOP_DPP.SRC0_ABS,
-                    extData.iFmt_VOP_DPP.SRC0_NEG,
-                    extData.iFmt_VOP_DPP.SRC1_ABS,
-                    extData.iFmt_VOP_DPP.SRC1_NEG,
-                    extData.iFmt_VOP_DPP.BC,
-                    extData.iFmt_VOP_DPP.BANK_MASK,
-                    extData.iFmt_VOP_DPP.ROW_MASK);
-
-            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
-
-            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-                if (wf->execMask(lane)) {
-                    vdst[lane] = src0_dpp[lane] + src1[lane];
-                }
-            }
-        } else {
-            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-                if (wf->execMask(lane)) {
-                    vdst[lane] = src0[lane] + src1[lane];
-                }
-            }
-        }
-
-        vdst.write();
-    } // execute
-    // --- Inst_VOP2__V_SUB_F32 class methods ---
-
-    Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_sub_f32")
-    {
-        setFlag(ALU);
-        setFlag(F32);
-    } // Inst_VOP2__V_SUB_F32
-
-    Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32()
-    {
-    } // ~Inst_VOP2__V_SUB_F32
-
-    // --- description from .arch file ---
-    // D.f = S0.f - S1.f.
-    // SQ translates to V_ADD_F32.
-    void
-    Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
-        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
-        VecOperandF32 vdst(gpuDynInst, instData.VDST);
-
-        src0.readSrc();
-        src1.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src0[lane] - src1[lane];
-            }
-        }
-
-        vdst.write();
-    } // execute
-    // --- Inst_VOP2__V_SUBREV_F32 class methods ---
-
-    Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_subrev_f32")
-    {
-        setFlag(ALU);
-        setFlag(F32);
-    } // Inst_VOP2__V_SUBREV_F32
-
-    Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32()
-    {
-    } // ~Inst_VOP2__V_SUBREV_F32
-
-    // --- description from .arch file ---
-    // D.f = S1.f - S0.f.
-    // SQ translates to V_ADD_F32.
-    void
-    Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
-        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
-        VecOperandF32 vdst(gpuDynInst, instData.VDST);
-
-        src0.readSrc();
-        src1.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src1[lane] - src0[lane];
-            }
-        }
-
-        vdst.write();
-    } // execute
-    // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods ---
-
-    Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_mul_legacy_f32")
-    {
-        setFlag(ALU);
-        setFlag(F32);
-    } // Inst_VOP2__V_MUL_LEGACY_F32
-
-    Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32()
-    {
-    } // ~Inst_VOP2__V_MUL_LEGACY_F32
-
-    // --- description from .arch file ---
-    // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0).
- void - Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_F32 class methods --- - - Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MUL_F32 - - Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32() - { - } // ~Inst_VOP2__V_MUL_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f. - void - Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_I32_I24 class methods --- - - Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_i32_i24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_I32_I24 - - Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24() - { - } // ~Inst_VOP2__V_MUL_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0]. 
- void - Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods --- - - Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_hi_i32_i24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_HI_I32_I24 - - Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24() - { - } // ~Inst_VOP2__V_MUL_HI_I32_I24 - - // --- description from .arch file --- - // D.i = (S0.i[23:0] * S1.i[23:0])>>32. - void - Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 tmp_src0 - = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); - VecElemI64 tmp_src1 - = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); - - vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_U32_U24 class methods --- - - Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_u32_u24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_U32_U24 - - Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24() - { - } // ~Inst_VOP2__V_MUL_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0]. - void - Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1, - VecOperandU32& vdst, Wavefront* wf) { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * - bits(src1[lane], 23, 0); - } - } - }; - - vop2Helper(gpuDynInst, opImpl); - } // execute - // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods --- - - Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_hi_u32_u24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_HI_U32_U24 - - Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24() - { - } // ~Inst_VOP2__V_MUL_HI_U32_U24 - - // --- description from .arch file --- - // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
- void - Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_F32 class methods --- - - Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MIN_F32 - - Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() - { - } // ~Inst_VOP2__V_MIN_F32 - - // --- description from .arch file --- - // D.f = (S0.f < S1.f ? S0.f : S1.f). - void - Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_F32 class methods --- - - Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MAX_F32 - - Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() - { - } // ~Inst_VOP2__V_MAX_F32 - - // --- description from .arch file --- - // D.f = (S0.f >= S1.f ? S0.f : S1.f). - void - Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_I32 class methods --- - - Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I32 - - Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() - { - } // ~Inst_VOP2__V_MIN_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i). - void - Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_I32 class methods --- - - Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I32 - - Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() - { - } // ~Inst_VOP2__V_MAX_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_U32 class methods --- - - Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U32 - - Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() - { - } // ~Inst_VOP2__V_MIN_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u). - void - Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_U32 class methods --- - - Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U32 - - Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() - { - } // ~Inst_VOP2__V_MAX_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u). - void - Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHRREV_B32 class methods --- - - Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B32 - - Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() - { - } // ~Inst_VOP2__V_LSHRREV_B32 - - // --- description from .arch file --- - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ASHRREV_I32 class methods --- - - Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I32 - - Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() - { - } // ~Inst_VOP2__V_ASHRREV_I32 - - // --- description from .arch file --- - // D.i = signext(S1.i) >> S0.i[4:0]. 
- // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHLREV_B32 class methods --- - - Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshlrev_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHLREV_B32 - - Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32() - { - } // ~Inst_VOP2__V_LSHLREV_B32 - - // --- description from .arch file --- - // D.u = S1.u << S0.u[4:0]. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and vdst during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register " - "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: " - "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: " - "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0); - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_AND_B32 class methods --- - - Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_and_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_AND_B32 - - Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32() - { - } // ~Inst_VOP2__V_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isDPPInst()) { - VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src0_dpp.read(); - - DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_dpp[lane] & src1[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] & src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_OR_B32 class methods --- - - Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_or_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_OR_B32 - - Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32() - { - } // ~Inst_VOP2__V_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u. - // Input and output modifiers not supported. - void - Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. 
SRC0: register v[%d], " - "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] | src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_XOR_B32 class methods --- - - Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_xor_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_XOR_B32 - - Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32() - { - } // ~Inst_VOP2__V_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u. - // Input and output modifiers not supported. - void - Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] ^ src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAC_F32 class methods --- - - Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mac_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAC); - } // Inst_VOP2__V_MAC_F32 - - Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32() - { - } // ~Inst_VOP2__V_MAC_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + D.f. - // SQ translates to V_MAD_F32. - void - Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - VecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - vdst.read(); - - if (isDPPInst()) { - VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src0_dpp.read(); - - DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0_dpp[lane], src1[lane], - vdst[lane]); - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MADMK_F32 class methods --- - - Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madmk_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP2__V_MADMK_F32 - - Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32() - { - } // ~Inst_VOP2__V_MADMK_F32 - - // --- description from .arch file --- - // D.f = S0.f * K + S1.f; K is a 32-bit inline constant. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // --- modifiers. - // SQ translates to V_MAD_F32. - void - Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], k, src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MADAK_F32 class methods --- - - Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F32 - - Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() - { - } // ~Inst_VOP2__V_MADAK_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // --- modifiers. - // SQ translates to V_MAD_F32. - void - Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], k); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ADD_CO_U32 class methods --- - - Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_ADD_CO_U32 - - Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32() - { - } // ~Inst_VOP2__V_ADD_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED - // --- overflow or carry-out for V_ADDC_U32. 
- // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. SRC0: register " - "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] + src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); - } - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_CO_U32 class methods --- - - Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUB_CO_U32 - - Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32() - { - } // ~Inst_VOP2__V_SUB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - vcc.setBit(lane, src1[lane] > src0[lane] ? 
1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_CO_U32 class methods --- - - Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUBREV_CO_U32 - - Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32() - { - } // ~Inst_VOP2__V_SUBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_ADDC_CO_U32 class methods --- - - Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_addc_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_ADDC_CO_U32 - - Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32() - { - } // ~Inst_VOP2__V_ADDC_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + VCC[threadId]; - // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) - // is an UNSIGNED overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. - void - Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] - + bits(vcc.rawData(), lane); - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] - + (VecElemU64)bits(vcc.rawData(), lane, lane)) - >= 0x100000000 ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBB_CO_U32 class methods --- - - Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subb_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBB_CO_U32 - - Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32() - { - } // ~Inst_VOP2__V_SUBB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // --- overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // --- source comes from the SGPR-pair at S2.u. 
- void - Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods --- - - Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subbrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBBREV_CO_U32 - - Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32() - { - } // ~Inst_VOP2__V_SUBBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. - // SQ translates this to V_SUBREV_U32 with reversed operands. - void - Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) - > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_ADD_F16 class methods --- - - Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_ADD_F16 - - Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() - { - } // ~Inst_VOP2__V_ADD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_SUB_F16 class methods --- - - Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUB_F16 - - Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() - { - } // ~Inst_VOP2__V_SUB_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 - S1.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. 
- void - Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_SUBREV_F16 class methods --- - - Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUBREV_F16 - - Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() - { - } // ~Inst_VOP2__V_SUBREV_F16 - - // --- description from .arch file --- - // D.f16 = S1.f16 - S0.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. - void - Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MUL_F16 class methods --- - - Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MUL_F16 - - Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() - { - } // ~Inst_VOP2__V_MUL_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MAC_F16 class methods --- - - Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mac_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAC); - } // Inst_VOP2__V_MAC_F16 - - Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() - { - } // ~Inst_VOP2__V_MAC_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + D.f16. - // Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. - void - Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MADMK_F16 class methods --- - - Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madmk_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADMK_F16 - - Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() - { - } // ~Inst_VOP2__V_MADMK_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. - void - Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MADAK_F16 class methods --- - - Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F16 - - Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() - { - } // ~Inst_VOP2__V_MADAK_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. 
- void - Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_ADD_U16 class methods --- - - Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_ADD_U16 - - Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() - { - } // ~Inst_VOP2__V_ADD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 + S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_U16 class methods --- - - Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUB_U16 - - Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() - { - } // ~Inst_VOP2__V_SUB_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 - S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_U16 class methods --- - - Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUBREV_U16 - - Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() - { - } // ~Inst_VOP2__V_SUBREV_U16 - - // --- description from .arch file --- - // D.u16 = S1.u16 - S0.u16. - // Supports saturation (unsigned 16-bit integer domain). - // SQ translates this to V_SUB_U16 with reversed operands. - void - Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_LO_U16 class methods --- - - Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_lo_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_LO_U16 - - Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() - { - } // ~Inst_VOP2__V_MUL_LO_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16. - // Supports saturation (unsigned 16-bit integer domain). 
- void - Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHLREV_B16 class methods --- - - Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshlrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHLREV_B16 - - Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() - { - } // ~Inst_VOP2__V_LSHLREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHRREV_B16 class methods --- - - Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B16 - - Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() - { - } // ~Inst_VOP2__V_LSHRREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ASHRREV_I16 class methods --- - - Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I16 - - Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() - { - } // ~Inst_VOP2__V_ASHRREV_I16 - - // --- description from .arch file --- - // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_F16 class methods --- - - Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MAX_F16 - - Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() - { - } // ~Inst_VOP2__V_MAX_F16 - - // --- description from .arch file --- - // D.f16 = max(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MIN_F16 class methods --- - - Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MIN_F16 - - Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() - { - } // ~Inst_VOP2__V_MIN_F16 - - // --- description from .arch file --- - // D.f16 = min(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MAX_U16 class methods --- - - Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U16 - - Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() - { - } // ~Inst_VOP2__V_MAX_U16 - - // --- description from .arch file --- - // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). - void - Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_I16 class methods --- - - Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I16 - - Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() - { - } // ~Inst_VOP2__V_MAX_I16 - - // --- description from .arch file --- - // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). 
- void - Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_U16 class methods --- - - Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U16 - - Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() - { - } // ~Inst_VOP2__V_MIN_U16 - - // --- description from .arch file --- - // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). - void - Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_I16 class methods --- - - Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I16 - - Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() - { - } // ~Inst_VOP2__V_MIN_I16 - - // --- description from .arch file --- - // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LDEXP_F16 class methods --- - - Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ldexp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_LDEXP_F16 - - Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() - { - } // ~Inst_VOP2__V_LDEXP_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * (2 ** S1.i16). 
- void - Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_ADD_U32 class methods --- - - Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_ADD_U32 - - Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() - { - } // ~Inst_VOP2__V_ADD_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - void - Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], " - "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] + src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_U32 class methods --- - - Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_SUB_U32 - - Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() - { - } // ~Inst_VOP2__V_SUB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - void - Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_U32 class methods --- - - Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_SUBREV_U32 - - Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32() - { - } // ~Inst_VOP2__V_SUBREV_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - void - 
Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_FMAC_F32 class methods --- - - Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_fmac_f32") - { - setFlag(ALU); - } // Inst_VOP2__V_FMAC_F32 - - Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32() - { - } // ~Inst_VOP2__V_FMAC_F32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - void - Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - vdst.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_NOP class methods --- - - Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_nop") - { - setFlag(Nop); - setFlag(ALU); - } // Inst_VOP1__V_NOP - - Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP() - { - } // ~Inst_VOP1__V_NOP - - // --- description from .arch file --- - // Do nothing. - void - Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_VOP1__V_MOV_B32 class methods --- - - Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_mov_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_MOV_B32 - - Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32() - { - } // ~Inst_VOP1__V_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (isDPPInst()) { - VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src_dpp.read(); - - DPRINTF(VEGA, "Handling V_MOV_B32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - // NOTE: For VOP1, there is no SRC1, so make sure we're not trying - // to negate it or take the absolute value of it - assert(!extData.iFmt_VOP_DPP.SRC1_ABS); - assert(!extData.iFmt_VOP_DPP.SRC1_NEG); - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src_dpp[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_READFIRSTLANE_B32 class methods --- - - Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_readfirstlane_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_READFIRSTLANE_B32 - - Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32() - { - } // ~Inst_VOP1__V_READFIRSTLANE_B32 - - // --- description from .arch file --- - // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data - // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec) - // (Lane# = 0 if exec is zero). Ignores exec mask for the access. SQ - // translates to V_READLANE_B32. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarRegI32 src_lane(0); - ScalarRegU64 exec_mask = wf->execMask().to_ullong(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (exec_mask) { - src_lane = findLsbSet(exec_mask); - } - - sdst = src[src_lane]; - - sdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_I32_F64 class methods --- - - Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_I32_F64 - - Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64() - { - } // ~Inst_VOP1__V_CVT_I32_F64 - - // --- description from .arch file --- - // D.i = (int)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
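For reference, the saturation rule described above (and implemented below with a std::frexp-based range check) can be captured by a small standalone helper. This is an illustrative sketch with a hypothetical name, not the simulator's code path:

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // NaN -> 0, out-of-range values (including infinity) saturate, otherwise
    // truncate toward zero -- the semantics the .arch text describes.
    int32_t
    cvtI32F64(double s)    // hypothetical helper, not part of gem5
    {
        if (std::isnan(s))
            return 0;
        if (s >= 2147483648.0)
            return std::numeric_limits<int32_t>::max();
        if (s < -2147483648.0)
            return std::numeric_limits<int32_t>::min();
        return static_cast<int32_t>(s);
    }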
- void - Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F64_I32 class methods --- - - Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_i32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_I32 - - Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32() - { - } // ~Inst_VOP1__V_CVT_F64_I32 - - // --- description from .arch file --- - // D.d = (double)S0.i. - void - Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_I32 class methods --- - - Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_i32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_I32 - - Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32() - { - } // ~Inst_VOP1__V_CVT_F32_I32 - - // --- description from .arch file --- - // D.f = (float)S0.i. - void - Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_U32 class methods --- - - Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_u32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_U32 - - Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32() - { - } // ~Inst_VOP1__V_CVT_F32_U32 - - // --- description from .arch file --- - // D.f = (float)S0.u. - void - Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_U32_F32 class methods --- - - Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_U32_F32 - - Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32() - { - } // ~Inst_VOP1__V_CVT_U32_F32 - - // --- description from .arch file --- - // D.u = (unsigned)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_I32_F32 class methods --- - - Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_I32_F32 - - Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32() - { - } // ~Inst_VOP1__V_CVT_I32_F32 - - // --- description from .arch file --- - // D.i = (int)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. - void - Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_MOV_FED_B32 class methods --- - - Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_mov_fed_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_MOV_FED_B32 - - Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32() - { - } // ~Inst_VOP1__V_MOV_FED_B32 - - // --- description from .arch file --- - // D.u = S0.u; - // Introduce EDC double error upon write to dest vgpr without causing an - // --- exception. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F16_F32 class methods --- - - Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F16_F32 - - Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32() - { - } // ~Inst_VOP1__V_CVT_F16_F32 - - // --- description from .arch file --- - // D.f16 = flt32_to_flt16(S0.f). - // Supports input modifiers and creates FP16 denormals when appropriate. - void - Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F32_F16 class methods --- - - Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_f16") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_F16 - - Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16() - { - } // ~Inst_VOP1__V_CVT_F32_F16 - - // --- description from .arch file --- - // D.f = flt16_to_flt32(S0.f16). - // FP16 denormal inputs are always accepted. 
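Both f16 conversions are left unimplemented here (their execute() bodies call panicUnimplemented()). For orientation only, expanding an IEEE binary16 payload to binary32, including the denormal inputs the description says must be accepted, could look like the following standalone sketch; the helper name and structure are assumptions, not gem5 code:

    #include <cstdint>
    #include <cstring>

    float
    halfToFloat(uint16_t h)    // hypothetical helper, not part of gem5
    {
        uint32_t sign = (h >> 15) & 0x1;
        uint32_t exp  = (h >> 10) & 0x1f;
        uint32_t mant = h & 0x3ff;
        uint32_t out;

        if (exp == 0) {
            if (mant == 0) {
                out = sign << 31;                           // +/- zero
            } else {
                // Denormal input: renormalize the mantissa for binary32.
                int shifts = 0;
                do { ++shifts; mant <<= 1; } while (!(mant & 0x400));
                out = (sign << 31) | ((uint32_t)(113 - shifts) << 23)
                    | ((mant & 0x3ff) << 13);
            }
        } else if (exp == 0x1f) {
            out = (sign << 31) | 0x7f800000 | (mant << 13); // inf or NaN
        } else {
            out = (sign << 31) | ((exp + 112) << 23) | (mant << 13);
        }

        float f;
        std::memcpy(&f, &out, sizeof(f));
        return f;
    }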
- void - Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods --- - - Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_RPI_I32_F32 - - Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32() - { - } // ~Inst_VOP1__V_CVT_RPI_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f + 0.5). - void - Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_FLR_I32_F32 class methods --- - - Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_FLR_I32_F32 - - Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32() - { - } // ~Inst_VOP1__V_CVT_FLR_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f). - void - Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_OFF_F32_I4 class methods --- - - Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_off_f32_i4") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_OFF_F32_I4 - - Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4() - { - } // ~Inst_VOP1__V_CVT_OFF_F32_I4 - - // --- description from .arch file --- - // 4-bit signed int to 32-bit float. Used for interpolation in shader. - void - Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) - { - // Could not parse sq_uc.arch desc field - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F32_F64 class methods --- - - Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F32_F64 - - Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64() - { - } // ~Inst_VOP1__V_CVT_F32_F64 - - // --- description from .arch file --- - // D.f = (float)S0.d. 
- void - Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F64_F32 class methods --- - - Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_f32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_F32 - - Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32() - { - } // ~Inst_VOP1__V_CVT_F64_F32 - - // --- description from .arch file --- - // D.d = (double)S0.f. - void - Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE0 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE0 - - Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE0 - - // --- description from .arch file --- - // D.f = (float)(S0.u[7:0]). - void - Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE1 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE1 - - Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE1 - - // --- description from .arch file --- - // D.f = (float)(S0.u[15:8]). - void - Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE2 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE2 - - Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE2 - - // --- description from .arch file --- - // D.f = (float)(S0.u[23:16]). 
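The four UBYTE variants differ only in which byte of the source dword they select before converting; the bits(src, hi, lo) calls above and below are equivalent to a shift-and-mask. A minimal sketch (the helper name is illustrative):

    #include <cstdint>

    float
    cvtF32UByte(uint32_t src, unsigned byteSel)   // byteSel in [0, 3]
    {
        return static_cast<float>((src >> (8 * byteSel)) & 0xff);
    }
    // e.g. cvtF32UByte(0x11223344, 2) == 34.0f -- selects the 0x22 byte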
- void - Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE3 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE3 - - Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE3 - - // --- description from .arch file --- - // D.f = (float)(S0.u[31:24]). - void - Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_U32_F64 class methods --- - - Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_U32_F64 - - Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64() - { - } // ~Inst_VOP1__V_CVT_U32_F64 - - // --- description from .arch file --- - // D.u = (unsigned)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. - void - Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F64_U32 class methods --- - - Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_u32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_U32 - - Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32() - { - } // ~Inst_VOP1__V_CVT_F64_U32 - - // --- description from .arch file --- - // D.d = (double)S0.u. 
- void - Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_TRUNC_F64 class methods --- - - Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_TRUNC_F64 - - Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64() - { - } // ~Inst_VOP1__V_TRUNC_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d), return integer part of S0.d. - void - Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CEIL_F64 class methods --- - - Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CEIL_F64 - - Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64() - { - } // ~Inst_VOP1__V_CEIL_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. - void - Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RNDNE_F64 class methods --- - - Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RNDNE_F64 - - Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64() - { - } // ~Inst_VOP1__V_RNDNE_F64 - - // --- description from .arch file --- - // D.d = round_nearest_even(S0.d). - void - Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FLOOR_F64 class methods --- - - Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FLOOR_F64 - - Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64() - { - } // ~Inst_VOP1__V_FLOOR_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. 
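The .arch pseudocode defines floor (and ceil) as trunc plus a conditional fix-up, while the simulator simply calls std::floor / std::ceil. A quick standalone check, with hypothetical names, that the two formulations agree:

    #include <cassert>
    #include <cmath>

    double
    floorFromArchPseudocode(double s)   // illustrative only
    {
        double d = std::trunc(s);
        if (s < 0.0 && s != d)
            d += -1.0;
        return d;
    }

    int
    main()
    {
        assert(floorFromArchPseudocode(-2.5) == std::floor(-2.5)); // both -3.0
        assert(floorFromArchPseudocode(2.5) == std::floor(2.5));   // both 2.0
        return 0;
    }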
- void - Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FRACT_F32 class methods --- - - Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FRACT_F32 - - Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32() - { - } // ~Inst_VOP1__V_FRACT_F32 - - // --- description from .arch file --- - // D.f = S0.f - floor(S0.f). - void - Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_TRUNC_F32 class methods --- - - Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_TRUNC_F32 - - Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32() - { - } // ~Inst_VOP1__V_TRUNC_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f), return integer part of S0.f. - void - Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst (gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CEIL_F32 class methods --- - - Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CEIL_F32 - - Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32() - { - } // ~Inst_VOP1__V_CEIL_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. - void - Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RNDNE_F32 class methods --- - - Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RNDNE_F32 - - Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32() - { - } // ~Inst_VOP1__V_RNDNE_F32 - - // --- description from .arch file --- - // D.f = round_nearest_even(S0.f). 
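round_nearest_even resolves ties toward the even neighbor rather than rounding half up; the simulator uses its own roundNearestEven() helper for this. In terms of the standard library, the same behavior can be sketched as follows (illustrative only, relying on the FE_TONEAREST rounding mode):

    #include <cfenv>
    #include <cmath>

    float
    rndne(float s)    // hypothetical helper, not part of gem5
    {
        std::fesetround(FE_TONEAREST);  // round-to-nearest, ties to even
        return std::nearbyint(s);
    }
    // e.g. rndne(0.5f) == 0.0f and rndne(1.5f) == 2.0f -- ties go to even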
- void - Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FLOOR_F32 class methods --- - - Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FLOOR_F32 - - Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32() - { - } // ~Inst_VOP1__V_FLOOR_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. - void - Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_EXP_F32 class methods --- - - Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_EXP_F32 - - Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32() - { - } // ~Inst_VOP1__V_EXP_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f). - void - Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_LOG_F32 class methods --- - - Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_LOG_F32 - - Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32() - { - } // ~Inst_VOP1__V_LOG_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm. - void - Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RCP_F32 class methods --- - - Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RCP_F32 - - Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32() - { - } // ~Inst_VOP1__V_RCP_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error. 
- void - Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RCP_IFLAG_F32 class methods --- - - Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_iflag_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RCP_IFLAG_F32 - - Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32() - { - } // ~Inst_VOP1__V_RCP_IFLAG_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise - // --- integer DIV_BY_ZERO exception but cannot raise floating-point - // --- exceptions. - void - Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RSQ_F32 class methods --- - - Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RSQ_F32 - - Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32() - { - } // ~Inst_VOP1__V_RSQ_F32 - - // --- description from .arch file --- - // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules. - void - Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RCP_F64 class methods --- - - Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RCP_F64 - - Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64() - { - } // ~Inst_VOP1__V_RCP_F64 - - // --- description from .arch file --- - // D.d = 1.0 / S0.d. - void - Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = 1.0 / src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RSQ_F64 class methods --- - - Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RSQ_F64 - - Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64() - { - } // ~Inst_VOP1__V_RSQ_F64 - - // --- description from .arch file --- - // D.d = 1.0 / sqrt(S0.d). 
See V_RSQ_F32. - void - Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane]) - && !std::signbit(src[lane])) { - vdst[lane] = 0.0; - } else if (std::signbit(src[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_SQRT_F32 class methods --- - - Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_SQRT_F32 - - Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32() - { - } // ~Inst_VOP1__V_SQRT_F32 - - // --- description from .arch file --- - // D.f = sqrt(S0.f). - void - Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_SQRT_F64 class methods --- - - Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_SQRT_F64 - - Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64() - { - } // ~Inst_VOP1__V_SQRT_F64 - - // --- description from .arch file --- - // D.d = sqrt(S0.d). - void - Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_SIN_F32 class methods --- - - Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sin_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_SIN_F32 - - Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32() - { - } // ~Inst_VOP1__V_SIN_F32 - - // --- description from .arch file --- - // D.f = sin(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 0.0. 
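Note that the operand is measured in revolutions rather than radians: the implementation below multiplies by 2*PI (read from the REG_PI scalar register) before calling std::sin. A quick standalone check of that mapping (the pi constant is written out explicitly here):

    #include <cassert>
    #include <cmath>

    int
    main()
    {
        const float pi = 3.14159265358979f;
        float quarterTurn = 0.25f;
        float result = std::sin(quarterTurn * 2.0f * pi);
        assert(std::fabs(result - 1.0f) < 1e-6f);  // sin(pi/2) == 1
        return 0;
    }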
- void - Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (src[lane] < -256.0 || src[lane] > 256.0) { - vdst[lane] = 0.0; - } else { - vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData()); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_COS_F32 class methods --- - - Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cos_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_COS_F32 - - Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32() - { - } // ~Inst_VOP1__V_COS_F32 - - // --- description from .arch file --- - // D.f = cos(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 1.0. - void - Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (src[lane] < -256.0 || src[lane] > 256.0) { - vdst[lane] = 0.0; - } else { - vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData()); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_NOT_B32 class methods --- - - Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_not_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_NOT_B32 - - Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32() - { - } // ~Inst_VOP1__V_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u. - // Input and output modifiers not supported. - void - Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ~src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_BFREV_B32 class methods --- - - Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_bfrev_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_BFREV_B32 - - Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32() - { - } // ~Inst_VOP1__V_BFREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31], bitfield reverse. - // Input and output modifiers not supported. 
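reverseBits() swaps bit 0 with bit 31, bit 1 with bit 30, and so on. A minimal standalone equivalent (the name is illustrative):

    #include <cstdint>

    uint32_t
    reverseBits32(uint32_t v)    // hypothetical helper, not part of gem5
    {
        uint32_t r = 0;
        for (int i = 0; i < 32; ++i) {
            r = (r << 1) | (v & 1);  // append the lowest bit of v to r
            v >>= 1;
        }
        return r;
    }
    // e.g. reverseBits32(0x00000001) == 0x80000000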
- void - Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = reverseBits(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FFBH_U32 class methods --- - - Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbh_u32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBH_U32 - - Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32() - { - } // ~Inst_VOP1__V_FFBH_U32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from MSB; - // D.u = 0xffffffff if S0.u == 0. - void - Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOneMsb(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FFBL_B32 class methods --- - - Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbl_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBL_B32 - - Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32() - { - } // ~Inst_VOP1__V_FFBL_B32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from LSB; - // D.u = 0xffffffff if S0.u == 0. - void - Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOne(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FFBH_I32 class methods --- - - Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbh_i32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBH_I32 - - Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32() - { - } // ~Inst_VOP1__V_FFBH_I32 - - // --- description from .arch file --- - // D.u = position of first bit different from sign bit in S0.i from MSB; - // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. - void - Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = firstOppositeSignBit(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_EXP_I32_F64 class methods --- - - Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FREXP_EXP_I32_F64 - - Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64() - { - } // ~Inst_VOP1__V_FREXP_EXP_I32_F64 - - // --- description from .arch file --- - // See V_FREXP_EXP_I32_F32. 
- void - Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp = 0; - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_MANT_F64 class methods --- - - Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FREXP_MANT_F64 - - Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64() - { - } // ~Inst_VOP1__V_FREXP_MANT_F64 - - // --- description from .arch file --- - // See V_FREXP_MANT_F32. - void - Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FRACT_F64 class methods --- - - Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FRACT_F64 - - Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64() - { - } // ~Inst_VOP1__V_FRACT_F64 - - // --- description from .arch file --- - // See V_FRACT_F32. - void - Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF64 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_EXP_I32_F32 class methods --- - - Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FREXP_EXP_I32_F32 - - Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32() - { - } // ~Inst_VOP1__V_FREXP_EXP_I32_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) then D.i = 0; - // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). - // Returns exponent of single precision float input, such that S0.f = - // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns - // the significand. 
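A worked example of the frexp convention these FREXP_* opcodes expose: the input is split so that S0.f == significand * 2^exponent with |significand| in [0.5, 1.0); the FREXP_EXP opcodes return the exponent and the FREXP_MANT opcodes the significand.

    #include <cassert>
    #include <cmath>

    int
    main()
    {
        int exp = 0;
        float mant = std::frexp(192.0f, &exp);
        assert(mant == 0.75f && exp == 8);  // 192 == 0.75 * 2^8
        return 0;
    }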
- void - Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_MANT_F32 class methods --- - - Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FREXP_MANT_F32 - - Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32() - { - } // ~Inst_VOP1__V_FREXP_MANT_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) then D.f = S0.f; - // else D.f = Mantissa(S0.f). - // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary - // --- significand of single precision float input, such that S0.f = - // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which - // --- returns integer exponent. - void - Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CLREXCP class methods --- - - Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_clrexcp") - { - setFlag(ALU); - } // Inst_VOP1__V_CLREXCP - - Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP() - { - } // ~Inst_VOP1__V_CLREXCP - - // --- description from .arch file --- - // Clear wave's exception state in SIMD (SP). - void - Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F16_U16 class methods --- - - Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_u16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_F16_U16 - - Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16() - { - } // ~Inst_VOP1__V_CVT_F16_U16 - - // --- description from .arch file --- - // D.f16 = uint16_to_flt16(S.u16). - // Supports denormals, rounding, exception flags and saturation. - void - Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F16_I16 class methods --- - - Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_i16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_F16_I16 - - Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16() - { - } // ~Inst_VOP1__V_CVT_F16_I16 - - // --- description from .arch file --- - // D.f16 = int16_to_flt16(S.i16). - // Supports denormals, rounding, exception flags and saturation. 
- void - Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_U16_F16 class methods --- - - Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_U16_F16 - - Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16() - { - } // ~Inst_VOP1__V_CVT_U16_F16 - - // --- description from .arch file --- - // D.u16 = flt16_to_uint16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_I16_F16 class methods --- - - Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_I16_F16 - - Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16() - { - } // ~Inst_VOP1__V_CVT_I16_F16 - - // --- description from .arch file --- - // D.i16 = flt16_to_int16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_RCP_F16 class methods --- - - Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RCP_F16 - - Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16() - { - } // ~Inst_VOP1__V_RCP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecip(S0.f16). - void - Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_SQRT_F16 class methods --- - - Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_SQRT_F16 - - Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16() - { - } // ~Inst_VOP1__V_SQRT_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateSqrt(S0.f16). - void - Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_RSQ_F16 class methods --- - - Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RSQ_F16 - - Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16() - { - } // ~Inst_VOP1__V_RSQ_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecipSqrt(S0.f16). - void - Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_LOG_F16 class methods --- - - Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_LOG_F16 - - Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16() - { - } // ~Inst_VOP1__V_LOG_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 0.0f; - // else - // D.f16 = ApproximateLog2(S0.f16). 
- void - Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_EXP_F16 class methods --- - - Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_EXP_F16 - - Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16() - { - } // ~Inst_VOP1__V_EXP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 0.0f) - // D.f16 = 1.0f; - // else - // D.f16 = Approximate2ToX(S0.f16). - void - Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FREXP_MANT_F16 class methods --- - - Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FREXP_MANT_F16 - - Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16() - { - } // ~Inst_VOP1__V_FREXP_MANT_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.f16 = S0.f16; - // else - // D.f16 = mantissa(S0.f16). - // Result range is (-1.0,-0.5][0.5,1.0). - // C math library frexp function. - // Returns binary significand of half precision float input, such that the - // original single float = significand * (2 ** exponent). - void - Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FREXP_EXP_I16_F16 class methods --- - - Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FREXP_EXP_I16_F16 - - Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16() - { - } // ~Inst_VOP1__V_FREXP_EXP_I16_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.i16 = 0; - // else - // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). - // C math library frexp function. - // Returns exponent of half precision float input, such that the - // original single float = significand * (2 ** exponent). - void - Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FLOOR_F16 class methods --- - - Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FLOOR_F16 - - Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16() - { - } // ~Inst_VOP1__V_FLOOR_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. - void - Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CEIL_F16 class methods --- - - Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CEIL_F16 - - Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16() - { - } // ~Inst_VOP1__V_CEIL_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
- void - Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_TRUNC_F16 class methods --- - - Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_TRUNC_F16 - - Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16() - { - } // ~Inst_VOP1__V_TRUNC_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16). - // Round-to-zero semantics. - void - Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_RNDNE_F16 class methods --- - - Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RNDNE_F16 - - Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16() - { - } // ~Inst_VOP1__V_RNDNE_F16 - - // --- description from .arch file --- - // D.f16 = FLOOR(S0.f16 + 0.5f); - // if(floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. - // Round-to-nearest-even semantics. - void - Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FRACT_F16 class methods --- - - Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FRACT_F16 - - Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16() - { - } // ~Inst_VOP1__V_FRACT_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + -floor(S0.f16). - void - Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_SIN_F16 class methods --- - - Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sin_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_SIN_F16 - - Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16() - { - } // ~Inst_VOP1__V_SIN_F16 - - // --- description from .arch file --- - // D.f16 = sin(S0.f16 * 2 * PI). - void - Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_COS_F16 class methods --- - - Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cos_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_COS_F16 - - Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16() - { - } // ~Inst_VOP1__V_COS_F16 - - // --- description from .arch file --- - // D.f16 = cos(S0.f16 * 2 * PI). - void - Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_EXP_LEGACY_F32 class methods --- - - Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_EXP_LEGACY_F32 - - Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32() - { - } // ~Inst_VOP1__V_EXP_LEGACY_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f) with legacy semantics. 
- void - Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_LOG_LEGACY_F32 class methods --- - - Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_LOG_LEGACY_F32 - - Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32() - { - } // ~Inst_VOP1__V_LOG_LEGACY_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. - void - Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOPC__V_CMP_CLASS_F32 class methods --- - - Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_CLASS_F32 - - Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32() - { - } // ~Inst_VOPC__V_CMP_CLASS_F32 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
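The execute() that follows expands the S1 class mask into a long chain of per-bit tests built on std::fpclassify() and std::signbit(). A condensed sketch of the same mapping; the helper name and the stand-alone driver are illustrative assumptions, not simulator code.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // One lane of a v_cmp_class_f32-style test: true if 'val' falls in
    // any IEEE class whose bit is set in 'mask' (bit 0 sNaN, 1 qNaN,
    // 2 -inf, 3 -normal, 4 -denormal, 5 -0, 6 +0, 7 +denormal,
    // 8 +normal, 9 +inf).
    static bool
    laneMatchesClass(float val, uint32_t mask)
    {
        bool neg = std::signbit(val);
        uint32_t bit = 0;

        switch (std::fpclassify(val)) {
          // Like the loop that follows, this does not separate
          // signaling from quiet NaNs; either NaN bit matches.
          case FP_NAN:       bit = 0x3;                   break;
          case FP_INFINITE:  bit = neg ? 1 << 2 : 1 << 9; break;
          case FP_NORMAL:    bit = neg ? 1 << 3 : 1 << 8; break;
          case FP_SUBNORMAL: bit = neg ? 1 << 4 : 1 << 7; break;
          case FP_ZERO:      bit = neg ? 1 << 5 : 1 << 6; break;
        }
        return (mask & bit) != 0;
    }

    int
    main()
    {
        // -0.0 matches the negative-zero bit; 1.0 does not match the
        // negative-normal bit.
        std::printf("%d %d\n", laneMatchesClass(-0.0f, 1u << 5),
                    laneMatchesClass(1.0f, 1u << 3));
        return 0;
    }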
- void - Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F32 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F32 - - Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F32 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.f The function reports true if the floating point value is *any* of - // the numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
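In the execute() bodies shown in this file, a v_cmpx_* compare differs from its v_cmp_* counterpart only in the last step: after VCC is written, the same mask becomes the new EXEC mask (wf->execMask() = vcc.rawData()). A minimal sketch of that update, with plain uint64_t values standing in for the scalar operands.

    #include <cstdint>
    #include <cstdio>

    int
    main()
    {
        uint64_t vcc = 0;        // per-lane compare results, bit i = lane i
        uint64_t exec = ~0ULL;   // all 64 lanes initially active

        // Suppose lanes 0 and 2 passed the comparison.
        vcc |= (1ULL << 0) | (1ULL << 2);

        // v_cmp_* stops after writing VCC; v_cmpx_* additionally makes
        // the result the execution mask, disabling the failing lanes.
        exec = vcc;

        std::printf("exec = 0x%016llx\n",
                    static_cast<unsigned long long>(exec));
        return 0;
    }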
- void - Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_CLASS_F64 class methods --- - - Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_CLASS_F64 - - Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() - { - } // ~Inst_VOPC__V_CMP_CLASS_F64 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.d - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F64 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F64 - - Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F64 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.d The function reports true if the floating point value is *any* of - // the numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_CLASS_F16 class methods --- - - Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_CLASS_F16 - - Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16() - { - } // ~Inst_VOPC__V_CMP_CLASS_F16 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
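v_cmp_class_f16 and v_cmpx_class_f16 are panicUnimplemented() below. One reason a naive convert-to-float-then-classify port of the f32 code would be wrong: a half-precision denormal becomes a normal float after widening, so the denormal and zero class bits have to be judged on the 16-bit encoding itself. The sketch classifies a raw binary16 pattern directly; the helper is a hypothetical illustration, not the simulator's API.

    #include <cstdint>
    #include <cstdio>

    // Coarse IEEE binary16 classification: exponent field 0x1f is
    // inf/NaN, exponent field 0 is zero or denormal, everything else
    // is a normal value.
    static const char *
    halfClass(uint16_t h)
    {
        uint16_t exp = (h >> 10) & 0x1f;
        uint16_t man = h & 0x3ff;
        bool neg = h & 0x8000;

        if (exp == 0x1f) return man ? "nan" : (neg ? "-inf" : "+inf");
        if (exp == 0)    return man ? (neg ? "-denorm" : "+denorm")
                                    : (neg ? "-0" : "+0");
        return neg ? "-normal" : "+normal";
    }

    int
    main()
    {
        // 0x0001 is the smallest positive half denormal (2^-24); as a
        // float it would classify as FP_NORMAL and hide the case.
        std::printf("%s %s\n", halfClass(0x0001), halfClass(0xfc00));
        return 0;
    }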
- void - Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F16 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F16 - - Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F16 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // --- S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_F_F16 class methods --- - - Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_F_F16 - - Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() - { - } // ~Inst_VOPC__V_CMP_F_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F16 class methods --- - - Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LT_F16 - - Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() - { - } // ~Inst_VOPC__V_CMP_LT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F16 class methods --- - - Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_EQ_F16 - - Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() - { - } // ~Inst_VOPC__V_CMP_EQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F16 class methods --- - - Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LE_F16 - - Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() - { - } // ~Inst_VOPC__V_CMP_LE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
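Every f16 VOPC comparison in this stretch falls through to panicUnimplemented(). If they were filled in, the lane loop would look like the f32 versions further down, with the 16-bit sources widened first. The halfToFloat() conversion below is a hypothetical stand-in written out for illustration (the widening is exact, since every binary16 value is representable in binary32), and plain variables replace the operand classes and EXEC mask.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Hypothetical half -> float widening.
    static float
    halfToFloat(uint16_t h)
    {
        int exp = (h >> 10) & 0x1f;
        int man = h & 0x3ff;
        float sign = (h & 0x8000) ? -1.0f : 1.0f;

        if (exp == 0x1f) {
            return man ? std::numeric_limits<float>::quiet_NaN()
                       : sign * std::numeric_limits<float>::infinity();
        }
        if (exp == 0) {
            return sign * std::ldexp(static_cast<float>(man), -24);
        }
        return sign * std::ldexp(1.0f + man / 1024.0f, exp - 15);
    }

    int
    main()
    {
        // One lane of a would-be v_cmp_le_f16: widen, compare, set the
        // lane's bit in a 64-bit mask the way the f32 compares do.
        uint16_t src0 = 0x3c00;   // 1.0 in binary16
        uint16_t src1 = 0x4000;   // 2.0 in binary16
        uint64_t vcc = 0;
        int lane = 0;

        if (halfToFloat(src0) <= halfToFloat(src1)) {
            vcc |= 1ULL << lane;
        }
        std::printf("vcc = 0x%llx\n",
                    static_cast<unsigned long long>(vcc));
        return 0;
    }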
- void - Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F16 class methods --- - - Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GT_F16 - - Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() - { - } // ~Inst_VOPC__V_CMP_GT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F16 class methods --- - - Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LG_F16 - - Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() - { - } // ~Inst_VOPC__V_CMP_LG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F16 class methods --- - - Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GE_F16 - - Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() - { - } // ~Inst_VOPC__V_CMP_GE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_O_F16 class methods --- - - Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_O_F16 - - Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() - { - } // ~Inst_VOPC__V_CMP_O_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_U_F16 class methods --- - - Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_U_F16 - - Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() - { - } // ~Inst_VOPC__V_CMP_U_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F16 class methods --- - - Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGE_F16 - - Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() - { - } // ~Inst_VOPC__V_CMP_NGE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F16 class methods --- - - Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLG_F16 - - Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() - { - } // ~Inst_VOPC__V_CMP_NLG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F16 class methods --- - - Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGT_F16 - - Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() - { - } // ~Inst_VOPC__V_CMP_NGT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F16 class methods --- - - Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLE_F16 - - Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() - { - } // ~Inst_VOPC__V_CMP_NLE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F16 class methods --- - - Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NEQ_F16 - - Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() - { - } // ~Inst_VOPC__V_CMP_NEQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F16 class methods --- - - Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLT_F16 - - Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() - { - } // ~Inst_VOPC__V_CMP_NLT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F16 class methods --- - - Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_TRU_F16 - - Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() - { - } // ~Inst_VOPC__V_CMP_TRU_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F16 class methods --- - - Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F16 - - Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() - { - } // ~Inst_VOPC__V_CMPX_F_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F16 class methods --- - - Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F16 - - Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() - { - } // ~Inst_VOPC__V_CMPX_LT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F16 class methods --- - - Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F16 - - Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() - { - } // ~Inst_VOPC__V_CMPX_EQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F16 class methods --- - - Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F16 - - Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() - { - } // ~Inst_VOPC__V_CMPX_LE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F16 class methods --- - - Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F16 - - Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() - { - } // ~Inst_VOPC__V_CMPX_GT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F16 class methods --- - - Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F16 - - Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() - { - } // ~Inst_VOPC__V_CMPX_LG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F16 class methods --- - - Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F16 - - Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() - { - } // ~Inst_VOPC__V_CMPX_GE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F16 class methods --- - - Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F16 - - Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() - { - } // ~Inst_VOPC__V_CMPX_O_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F16 class methods --- - - Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F16 - - Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() - { - } // ~Inst_VOPC__V_CMPX_U_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F16 class methods --- - - Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F16 - - Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16() - { - } // ~Inst_VOPC__V_CMPX_NGE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F16 class methods --- - - Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F16 - - Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16() - { - } // ~Inst_VOPC__V_CMPX_NLG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F16 class methods --- - - Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F16 - - Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16() - { - } // ~Inst_VOPC__V_CMPX_NGT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F16 class methods --- - - Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F16 - - Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16() - { - } // ~Inst_VOPC__V_CMPX_NLE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F16 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F16 - - Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F16 class methods --- - - Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F16 - - Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16() - { - } // ~Inst_VOPC__V_CMPX_NLT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F16 class methods --- - - Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F16 - - Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16() - { - } // ~Inst_VOPC__V_CMPX_TRU_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_F_F32 class methods --- - - Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_F_F32 - - Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32() - { - } // ~Inst_VOPC__V_CMP_F_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
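The f32 comparisons that follow all share one shape: read two vector sources, evaluate the predicate for each active lane, and pack the lane results into the 64-bit VCC scalar with setBit(). A stripped-down model of that packing for a 64-lane wavefront, with a plain uint64_t in place of ScalarOperandU64 and an explicit mask in place of wf->execMask(); leaving inactive lanes' bits alone is a choice of this sketch, not a statement about the simulator.

    #include <cstdint>
    #include <cstdio>

    int
    main()
    {
        const int numLanes = 64;
        float src0[numLanes], src1[numLanes];
        uint64_t execMask = 0xffffULL;   // only lanes 0..15 active
        uint64_t vcc = 0;

        for (int lane = 0; lane < numLanes; ++lane) {
            src0[lane] = static_cast<float>(lane);
            src1[lane] = 8.0f;
        }

        // v_cmp_lt_f32-style loop: each active lane contributes one bit.
        for (int lane = 0; lane < numLanes; ++lane) {
            if (execMask & (1ULL << lane)) {
                if (src0[lane] < src1[lane]) {
                    vcc |= 1ULL << lane;
                } else {
                    vcc &= ~(1ULL << lane);
                }
            }
        }

        // Lanes 0..7 pass (lane value < 8): vcc = 0x00000000000000ff.
        std::printf("vcc = 0x%016llx\n",
                    static_cast<unsigned long long>(vcc));
        return 0;
    }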
- void - Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F32 class methods --- - - Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LT_F32 - - Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32() - { - } // ~Inst_VOPC__V_CMP_LT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F32 class methods --- - - Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_EQ_F32 - - Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32() - { - } // ~Inst_VOPC__V_CMP_EQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F32 class methods --- - - Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LE_F32 - - Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32() - { - } // ~Inst_VOPC__V_CMP_LE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F32 class methods --- - - Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GT_F32 - - Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32() - { - } // ~Inst_VOPC__V_CMP_GT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F32 class methods --- - - Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LG_F32 - - Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32() - { - } // ~Inst_VOPC__V_CMP_LG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F32 class methods --- - - Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GE_F32 - - Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32() - { - } // ~Inst_VOPC__V_CMP_GE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_O_F32 class methods --- - - Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_O_F32 - - Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32() - { - } // ~Inst_VOPC__V_CMP_O_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_U_F32 class methods --- - - Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_U_F32 - - Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32() - { - } // ~Inst_VOPC__V_CMP_U_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F32 class methods --- - - Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGE_F32 - - Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32() - { - } // ~Inst_VOPC__V_CMP_NGE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F32 class methods --- - - Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLG_F32 - - Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32() - { - } // ~Inst_VOPC__V_CMP_NLG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F32 class methods --- - - Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGT_F32 - - Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32() - { - } // ~Inst_VOPC__V_CMP_NGT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
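One thing to keep in mind when reading the 'n'-prefixed compares: modelling v_cmp_nge_f32 as !(S0 >= S1) is not the same as S0 < S1 once a NaN is involved, and v_cmp_lg_f32 (written as < || >, an ordered not-equal) differs from v_cmp_neq_f32 (!(==)) for the same reason. A small host-side demonstration of the IEEE behaviour these implementations rely on:

    #include <cmath>
    #include <cstdio>

    int
    main()
    {
        float a = std::nanf("");
        float b = 1.0f;

        // Ordered comparisons with a NaN operand are false...
        std::printf("a <  b : %d\n", a < b);              // 0
        std::printf("a >= b : %d\n", a >= b);             // 0
        // ...so the negated forms used by the n* compares are true.
        std::printf("nge    : %d\n", !(a >= b));          // 1
        // lg (ordered) is false on NaN; neq (!(==)) is true on NaN.
        std::printf("lg     : %d\n", (a < b) || (a > b)); // 0
        std::printf("neq    : %d\n", !(a == b));          // 1
        return 0;
    }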
- void - Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F32 class methods --- - - Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLE_F32 - - Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32() - { - } // ~Inst_VOPC__V_CMP_NLE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F32 class methods --- - - Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NEQ_F32 - - Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32() - { - } // ~Inst_VOPC__V_CMP_NEQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F32 class methods --- - - Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLT_F32 - - Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32() - { - } // ~Inst_VOPC__V_CMP_NLT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F32 class methods --- - - Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_TRU_F32 - - Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32() - { - } // ~Inst_VOPC__V_CMP_TRU_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F32 class methods --- - - Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F32 - - Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32() - { - } // ~Inst_VOPC__V_CMPX_F_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F32 class methods --- - - Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F32 - - Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32() - { - } // ~Inst_VOPC__V_CMPX_LT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F32 class methods --- - - Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F32 - - Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32() - { - } // ~Inst_VOPC__V_CMPX_EQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F32 class methods --- - - Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F32 - - Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32() - { - } // ~Inst_VOPC__V_CMPX_LE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F32 class methods --- - - Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F32 - - Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32() - { - } // ~Inst_VOPC__V_CMPX_GT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F32 class methods --- - - Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F32 - - Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32() - { - } // ~Inst_VOPC__V_CMPX_LG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F32 class methods --- - - Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F32 - - Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32() - { - } // ~Inst_VOPC__V_CMPX_GE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F32 class methods --- - - Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F32 - - Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32() - { - } // ~Inst_VOPC__V_CMPX_O_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F32 class methods --- - - Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F32 - - Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32() - { - } // ~Inst_VOPC__V_CMPX_U_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F32 class methods --- - - Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F32 - - Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32() - { - } // ~Inst_VOPC__V_CMPX_NGE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F32 class methods --- - - Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F32 - - Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32() - { - } // ~Inst_VOPC__V_CMPX_NLG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F32 class methods --- - - Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F32 - - Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32() - { - } // ~Inst_VOPC__V_CMPX_NGT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F32 class methods --- - - Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F32 - - Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32() - { - } // ~Inst_VOPC__V_CMPX_NLE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F32 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F32 - - Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F32 class methods --- - - Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F32 - - Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32() - { - } // ~Inst_VOPC__V_CMPX_NLT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F32 class methods --- - - Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F32 - - Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32() - { - } // ~Inst_VOPC__V_CMPX_TRU_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_F_F64 class methods --- - - Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_F_F64 - - Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64() - { - } // ~Inst_VOPC__V_CMP_F_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F64 class methods --- - - Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LT_F64 - - Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64() - { - } // ~Inst_VOPC__V_CMP_LT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F64 class methods --- - - Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_EQ_F64 - - Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64() - { - } // ~Inst_VOPC__V_CMP_EQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F64 class methods --- - - Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LE_F64 - - Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64() - { - } // ~Inst_VOPC__V_CMP_LE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F64 class methods --- - - Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GT_F64 - - Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64() - { - } // ~Inst_VOPC__V_CMP_GT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F64 class methods --- - - Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LG_F64 - - Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64() - { - } // ~Inst_VOPC__V_CMP_LG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F64 class methods --- - - Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GE_F64 - - Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64() - { - } // ~Inst_VOPC__V_CMP_GE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_O_F64 class methods --- - - Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_O_F64 - - Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64() - { - } // ~Inst_VOPC__V_CMP_O_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_U_F64 class methods --- - - Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_U_F64 - - Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64() - { - } // ~Inst_VOPC__V_CMP_U_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F64 class methods --- - - Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGE_F64 - - Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64() - { - } // ~Inst_VOPC__V_CMP_NGE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F64 class methods --- - - Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLG_F64 - - Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64() - { - } // ~Inst_VOPC__V_CMP_NLG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F64 class methods --- - - Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGT_F64 - - Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64() - { - } // ~Inst_VOPC__V_CMP_NGT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F64 class methods --- - - Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLE_F64 - - Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64() - { - } // ~Inst_VOPC__V_CMP_NLE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F64 class methods --- - - Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NEQ_F64 - - Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64() - { - } // ~Inst_VOPC__V_CMP_NEQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F64 class methods --- - - Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLT_F64 - - Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64() - { - } // ~Inst_VOPC__V_CMP_NLT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F64 class methods --- - - Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_TRU_F64 - - Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64() - { - } // ~Inst_VOPC__V_CMP_TRU_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F64 class methods --- - - Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F64 - - Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64() - { - } // ~Inst_VOPC__V_CMPX_F_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F64 class methods --- - - Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F64 - - Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64() - { - } // ~Inst_VOPC__V_CMPX_LT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F64 class methods --- - - Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F64 - - Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64() - { - } // ~Inst_VOPC__V_CMPX_EQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F64 class methods --- - - Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F64 - - Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64() - { - } // ~Inst_VOPC__V_CMPX_LE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F64 class methods --- - - Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F64 - - Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64() - { - } // ~Inst_VOPC__V_CMPX_GT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F64 class methods --- - - Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F64 - - Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64() - { - } // ~Inst_VOPC__V_CMPX_LG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F64 class methods --- - - Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F64 - - Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64() - { - } // ~Inst_VOPC__V_CMPX_GE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F64 class methods --- - - Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F64 - - Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64() - { - } // ~Inst_VOPC__V_CMPX_O_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F64 class methods --- - - Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F64 - - Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64() - { - } // ~Inst_VOPC__V_CMPX_U_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F64 class methods --- - - Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F64 - - Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64() - { - } // ~Inst_VOPC__V_CMPX_NGE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F64 class methods --- - - Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F64 - - Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64() - { - } // ~Inst_VOPC__V_CMPX_NLG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F64 class methods --- - - Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F64 - - Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64() - { - } // ~Inst_VOPC__V_CMPX_NGT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F64 class methods --- - - Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F64 - - Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64() - { - } // ~Inst_VOPC__V_CMPX_NLE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F64 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F64 - - Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F64 class methods --- - - Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F64 - - Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64() - { - } // ~Inst_VOPC__V_CMPX_NLT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F64 class methods --- - - Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F64 - - Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64() - { - } // ~Inst_VOPC__V_CMPX_TRU_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I16 class methods --- - - Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I16 - - Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16() - { - } // ~Inst_VOPC__V_CMP_F_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I16 class methods --- - - Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I16 - - Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16() - { - } // ~Inst_VOPC__V_CMP_LT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I16 class methods --- - - Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I16 - - Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16() - { - } // ~Inst_VOPC__V_CMP_EQ_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I16 class methods --- - - Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I16 - - Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16() - { - } // ~Inst_VOPC__V_CMP_LE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I16 class methods --- - - Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I16 - - Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16() - { - } // ~Inst_VOPC__V_CMP_GT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I16 class methods --- - - Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I16 - - Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16() - { - } // ~Inst_VOPC__V_CMP_NE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I16 class methods --- - - Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I16 - - Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16() - { - } // ~Inst_VOPC__V_CMP_GE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I16 class methods --- - - Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I16 - - Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16() - { - } // ~Inst_VOPC__V_CMP_T_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U16 class methods --- - - Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U16 - - Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() - { - } // ~Inst_VOPC__V_CMP_F_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U16 class methods --- - - Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U16 - - Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() - { - } // ~Inst_VOPC__V_CMP_LT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U16 class methods --- - - Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U16 - - Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() - { - } // ~Inst_VOPC__V_CMP_EQ_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U16 class methods --- - - Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U16 - - Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() - { - } // ~Inst_VOPC__V_CMP_LE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U16 class methods --- - - Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U16 - - Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() - { - } // ~Inst_VOPC__V_CMP_GT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U16 class methods --- - - Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U16 - - Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() - { - } // ~Inst_VOPC__V_CMP_NE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U16 class methods --- - - Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U16 - - Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() - { - } // ~Inst_VOPC__V_CMP_GE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U16 class methods --- - - Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U16 - - Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() - { - } // ~Inst_VOPC__V_CMP_T_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I16 class methods --- - - Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I16 - - Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() - { - } // ~Inst_VOPC__V_CMPX_F_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I16 class methods --- - - Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I16 - - Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() - { - } // ~Inst_VOPC__V_CMPX_LT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I16 class methods --- - - Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I16 - - Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() - { - } // ~Inst_VOPC__V_CMPX_EQ_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I16 class methods --- - - Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I16 - - Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() - { - } // ~Inst_VOPC__V_CMPX_LE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I16 class methods --- - - Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I16 - - Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() - { - } // ~Inst_VOPC__V_CMPX_GT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I16 class methods --- - - Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I16 - - Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() - { - } // ~Inst_VOPC__V_CMPX_NE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I16 class methods --- - - Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I16 - - Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() - { - } // ~Inst_VOPC__V_CMPX_GE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I16 class methods --- - - Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I16 - - Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() - { - } // ~Inst_VOPC__V_CMPX_T_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U16 class methods --- - - Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U16 - - Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() - { - } // ~Inst_VOPC__V_CMPX_F_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U16 class methods --- - - Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U16 - - Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() - { - } // ~Inst_VOPC__V_CMPX_LT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U16 class methods --- - - Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U16 - - Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() - { - } // ~Inst_VOPC__V_CMPX_EQ_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U16 class methods --- - - Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U16 - - Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() - { - } // ~Inst_VOPC__V_CMPX_LE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U16 class methods --- - - Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U16 - - Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() - { - } // ~Inst_VOPC__V_CMPX_GT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U16 class methods --- - - Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U16 - - Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() - { - } // ~Inst_VOPC__V_CMPX_NE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U16 class methods --- - - Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U16 - - Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() - { - } // ~Inst_VOPC__V_CMPX_GE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U16 class methods --- - - Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U16 - - Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() - { - } // ~Inst_VOPC__V_CMPX_T_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I32 class methods --- - - Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I32 - - Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() - { - } // ~Inst_VOPC__V_CMP_F_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I32 class methods --- - - Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I32 - - Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() - { - } // ~Inst_VOPC__V_CMP_LT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I32 class methods --- - - Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I32 - - Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() - { - } // ~Inst_VOPC__V_CMP_EQ_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I32 class methods --- - - Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I32 - - Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() - { - } // ~Inst_VOPC__V_CMP_LE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I32 class methods --- - - Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I32 - - Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() - { - } // ~Inst_VOPC__V_CMP_GT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I32 class methods --- - - Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I32 - - Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() - { - } // ~Inst_VOPC__V_CMP_NE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I32 class methods --- - - Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I32 - - Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() - { - } // ~Inst_VOPC__V_CMP_GE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
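Every VOPC compare above follows the same per-lane pattern: evaluate the predicate for each lane that is active in EXEC, record the outcome as one bit of the 64-bit VCC mask, and, for the v_cmpx_* variants only, copy the finished mask back into EXEC as well. A minimal standalone sketch of that pattern in plain C++ (the names vopcCompare, execMask and cmpx are illustrative only, not gem5 API):

    #include <cstdint>
    #include <functional>

    // Illustrative per-lane VOPC compare over a 64-lane wavefront.
    // Only lanes set in execMask are evaluated; each result becomes one
    // bit of the returned VCC mask. CMPX-style compares also overwrite
    // execMask with that mask.
    template <typename T>
    uint64_t
    vopcCompare(const T (&src0)[64], const T (&src1)[64],
                const std::function<bool(T, T)> &pred,
                uint64_t &execMask, bool cmpx)
    {
        uint64_t vcc = 0;
        for (int lane = 0; lane < 64; ++lane) {
            if ((execMask & (1ULL << lane)) && pred(src0[lane], src1[lane])) {
                vcc |= 1ULL << lane;
            }
        }
        if (cmpx) {
            execMask = vcc;  // v_cmpx_* additionally updates EXEC
        }
        return vcc;
    }

Note that this sketch starts from a zeroed mask, whereas the gem5 code above only calls setBit() for active lanes, so bits belonging to inactive lanes keep whatever VCC previously held.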
- void - Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I32 class methods --- - - Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I32 - - Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() - { - } // ~Inst_VOPC__V_CMP_T_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U32 class methods --- - - Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U32 - - Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() - { - } // ~Inst_VOPC__V_CMP_F_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U32 class methods --- - - Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U32 - - Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() - { - } // ~Inst_VOPC__V_CMP_LT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U32 class methods --- - - Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U32 - - Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() - { - } // ~Inst_VOPC__V_CMP_EQ_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U32 class methods --- - - Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U32 - - Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() - { - } // ~Inst_VOPC__V_CMP_LE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U32 class methods --- - - Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U32 - - Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() - { - } // ~Inst_VOPC__V_CMP_GT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U32 class methods --- - - Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U32 - - Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() - { - } // ~Inst_VOPC__V_CMP_NE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U32 class methods --- - - Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U32 - - Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() - { - } // ~Inst_VOPC__V_CMP_GE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U32 class methods --- - - Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U32 - - Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() - { - } // ~Inst_VOPC__V_CMP_T_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I32 class methods --- - - Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I32 - - Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() - { - } // ~Inst_VOPC__V_CMPX_F_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I32 class methods --- - - Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I32 - - Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() - { - } // ~Inst_VOPC__V_CMPX_LT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I32 class methods --- - - Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I32 - - Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() - { - } // ~Inst_VOPC__V_CMPX_EQ_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I32 class methods --- - - Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I32 - - Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() - { - } // ~Inst_VOPC__V_CMPX_LE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I32 class methods --- - - Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I32 - - Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() - { - } // ~Inst_VOPC__V_CMPX_GT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I32 class methods --- - - Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I32 - - Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() - { - } // ~Inst_VOPC__V_CMPX_NE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I32 class methods --- - - Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I32 - - Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() - { - } // ~Inst_VOPC__V_CMPX_GE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I32 class methods --- - - Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I32 - - Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() - { - } // ~Inst_VOPC__V_CMPX_T_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U32 class methods --- - - Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U32 - - Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() - { - } // ~Inst_VOPC__V_CMPX_F_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U32 class methods --- - - Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U32 - - Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() - { - } // ~Inst_VOPC__V_CMPX_LT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U32 class methods --- - - Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U32 - - Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() - { - } // ~Inst_VOPC__V_CMPX_EQ_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U32 class methods --- - - Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U32 - - Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() - { - } // ~Inst_VOPC__V_CMPX_LE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U32 class methods --- - - Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U32 - - Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() - { - } // ~Inst_VOPC__V_CMPX_GT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U32 class methods --- - - Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U32 - - Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() - { - } // ~Inst_VOPC__V_CMPX_NE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U32 class methods --- - - Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U32 - - Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() - { - } // ~Inst_VOPC__V_CMPX_GE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U32 class methods --- - - Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U32 - - Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() - { - } // ~Inst_VOPC__V_CMPX_T_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I64 class methods --- - - Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I64 - - Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() - { - } // ~Inst_VOPC__V_CMP_F_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I64 class methods --- - - Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I64 - - Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() - { - } // ~Inst_VOPC__V_CMP_LT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I64 class methods --- - - Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I64 - - Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() - { - } // ~Inst_VOPC__V_CMP_EQ_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I64 class methods --- - - Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I64 - - Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() - { - } // ~Inst_VOPC__V_CMP_LE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I64 class methods --- - - Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I64 - - Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() - { - } // ~Inst_VOPC__V_CMP_GT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I64 class methods --- - - Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I64 - - Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() - { - } // ~Inst_VOPC__V_CMP_NE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I64 class methods --- - - Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I64 - - Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() - { - } // ~Inst_VOPC__V_CMP_GE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I64 class methods --- - - Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I64 - - Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() - { - } // ~Inst_VOPC__V_CMP_T_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U64 class methods --- - - Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U64 - - Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() - { - } // ~Inst_VOPC__V_CMP_F_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U64 class methods --- - - Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U64 - - Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() - { - } // ~Inst_VOPC__V_CMP_LT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U64 class methods --- - - Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U64 - - Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() - { - } // ~Inst_VOPC__V_CMP_EQ_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U64 class methods --- - - Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U64 - - Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() - { - } // ~Inst_VOPC__V_CMP_LE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U64 class methods --- - - Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U64 - - Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() - { - } // ~Inst_VOPC__V_CMP_GT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U64 class methods --- - - Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U64 - - Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() - { - } // ~Inst_VOPC__V_CMP_NE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U64 class methods --- - - Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U64 - - Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() - { - } // ~Inst_VOPC__V_CMP_GE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U64 class methods --- - - Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U64 - - Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() - { - } // ~Inst_VOPC__V_CMP_T_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I64 class methods --- - - Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I64 - - Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() - { - } // ~Inst_VOPC__V_CMPX_F_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I64 class methods --- - - Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I64 - - Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() - { - } // ~Inst_VOPC__V_CMPX_LT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I64 class methods --- - - Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I64 - - Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() - { - } // ~Inst_VOPC__V_CMPX_EQ_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I64 class methods --- - - Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I64 - - Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() - { - } // ~Inst_VOPC__V_CMPX_LE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I64 class methods --- - - Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I64 - - Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() - { - } // ~Inst_VOPC__V_CMPX_GT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I64 class methods --- - - Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I64 - - Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() - { - } // ~Inst_VOPC__V_CMPX_NE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I64 class methods --- - - Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I64 - - Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() - { - } // ~Inst_VOPC__V_CMPX_GE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I64 class methods --- - - Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I64 - - Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() - { - } // ~Inst_VOPC__V_CMPX_T_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U64 class methods --- - - Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U64 - - Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() - { - } // ~Inst_VOPC__V_CMPX_F_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U64 class methods --- - - Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U64 - - Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() - { - } // ~Inst_VOPC__V_CMPX_LT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U64 class methods --- - - Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U64 - - Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() - { - } // ~Inst_VOPC__V_CMPX_EQ_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U64 class methods --- - - Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U64 - - Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() - { - } // ~Inst_VOPC__V_CMPX_LE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U64 class methods --- - - Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U64 - - Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() - { - } // ~Inst_VOPC__V_CMPX_GT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U64 class methods --- - - Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U64 - - Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() - { - } // ~Inst_VOPC__V_CMPX_NE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U64 class methods --- - - Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U64 - - Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() - { - } // ~Inst_VOPC__V_CMPX_GE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U64 class methods --- - - Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U64 - - Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() - { - } // ~Inst_VOPC__V_CMPX_T_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VINTRP__V_INTERP_P1_F32 class methods --- - - Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_p1_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_P1_F32 - - Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32() - { - } // ~Inst_VINTRP__V_INTERP_P1_F32 - - // --- description from .arch file --- - // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; - // if D == S then data corruption will occur. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VINTRP__V_INTERP_P2_F32 class methods --- - - Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_p2_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_P2_F32 - - Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32() - { - } // ~Inst_VINTRP__V_INTERP_P2_F32 - - // --- description from .arch file --- - // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VINTRP__V_INTERP_MOV_F32 class methods --- - - Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_mov_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_MOV_F32 - - Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32() - { - } // ~Inst_VINTRP__V_INTERP_MOV_F32 - - // --- description from .arch file --- - // D.f = {P10,P20,P0}[S.u]; parameter load. 
- void - Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F32 class methods --- - - Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_CLASS_F32 - - Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32() - { - } // ~Inst_VOP3__V_CMP_CLASS_F32 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F32 class methods --- - - Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F32 - - Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F32 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in 
S1.u, performed on - // S0.f - // The function reports true if the floating point value is *any* of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F64 class methods --- - - Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_CLASS_F64 - - Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64() - { - } // ~Inst_VOP3__V_CMP_CLASS_F64 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.d - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. 
- // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F64 class methods --- - - Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F64 - - Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F64 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.d - // The function reports true if the floating point value is *any* of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
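The V_CMP_CLASS / V_CMPX_CLASS lane loops in this hunk all test the same ten S1 bit positions. As a compact cross-check, the mapping can be written with the <cmath> classification primitives; this is an illustrative sketch only (classMatch is a made-up helper, not gem5 code), included because the expanded per-bit tests are easy to misread.

    #include <cmath>
    #include <cstdint>

    // True if s0 belongs to any numeric class selected by the S1 bits:
    // 0 sNaN, 1 qNaN, 2 -inf, 3 -normal, 4 -denormal, 5 -0,
    // 6 +0, 7 +denormal, 8 +normal, 9 +inf.
    inline bool
    classMatch(double s0, uint32_t s1)
    {
        const bool neg = std::signbit(s0);
        switch (std::fpclassify(s0)) {
          case FP_NAN:
            // The implementations in this file accept either NaN bit and
            // do not distinguish signaling from quiet NaNs.
            return (s1 & 0x3) != 0;
          case FP_INFINITE:
            return (s1 & (neg ? 1u << 2 : 1u << 9)) != 0;
          case FP_NORMAL:
            return (s1 & (neg ? 1u << 3 : 1u << 8)) != 0;
          case FP_SUBNORMAL:
            return (s1 & (neg ? 1u << 4 : 1u << 7)) != 0;
          case FP_ZERO:
            return (s1 & (neg ? 1u << 5 : 1u << 6)) != 0;
        }
        return false;
    }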
- void - Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F16 class methods --- - - Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_CLASS_F16 - - Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16() - { - } // ~Inst_VOP3__V_CMP_CLASS_F16 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F16 class methods --- - - Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F16 - - Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F16 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // --- S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_F_F16 class methods --- - - Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_F_F16 - - Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16() - { - } // ~Inst_VOP3__V_CMP_F_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F16 class methods --- - - Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LT_F16 - - Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16() - { - } // ~Inst_VOP3__V_CMP_LT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F16 class methods --- - - Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_EQ_F16 - - Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16() - { - } // ~Inst_VOP3__V_CMP_EQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F16 class methods --- - - Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LE_F16 - - Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16() - { - } // ~Inst_VOP3__V_CMP_LE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F16 class methods --- - - Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_GT_F16 - - Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16() - { - } // ~Inst_VOP3__V_CMP_GT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F16 class methods --- - - Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LG_F16 - - Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16() - { - } // ~Inst_VOP3__V_CMP_LG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F16 class methods --- - - Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_GE_F16 - - Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16() - { - } // ~Inst_VOP3__V_CMP_GE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_O_F16 class methods --- - - Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_O_F16 - - Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16() - { - } // ~Inst_VOP3__V_CMP_O_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_U_F16 class methods --- - - Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_U_F16 - - Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() - { - } // ~Inst_VOP3__V_CMP_U_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F16 class methods --- - - Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGE_F16 - - Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() - { - } // ~Inst_VOP3__V_CMP_NGE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F16 class methods --- - - Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLG_F16 - - Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() - { - } // ~Inst_VOP3__V_CMP_NLG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F16 class methods --- - - Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGT_F16 - - Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() - { - } // ~Inst_VOP3__V_CMP_NGT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F16 class methods --- - - Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLE_F16 - - Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() - { - } // ~Inst_VOP3__V_CMP_NLE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F16 class methods --- - - Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NEQ_F16 - - Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() - { - } // ~Inst_VOP3__V_CMP_NEQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F16 class methods --- - - Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLT_F16 - - Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() - { - } // ~Inst_VOP3__V_CMP_NLT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F16 class methods --- - - Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_TRU_F16 - - Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() - { - } // ~Inst_VOP3__V_CMP_TRU_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F16 class methods --- - - Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F16 - - Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() - { - } // ~Inst_VOP3__V_CMPX_F_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F16 class methods --- - - Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F16 - - Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() - { - } // ~Inst_VOP3__V_CMPX_LT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F16 class methods --- - - Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F16 - - Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() - { - } // ~Inst_VOP3__V_CMPX_EQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F16 class methods --- - - Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F16 - - Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() - { - } // ~Inst_VOP3__V_CMPX_LE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F16 class methods --- - - Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F16 - - Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() - { - } // ~Inst_VOP3__V_CMPX_GT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F16 class methods --- - - Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F16 - - Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() - { - } // ~Inst_VOP3__V_CMPX_LG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F16 class methods --- - - Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F16 - - Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() - { - } // ~Inst_VOP3__V_CMPX_GE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F16 class methods --- - - Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F16 - - Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() - { - } // ~Inst_VOP3__V_CMPX_O_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F16 class methods --- - - Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F16 - - Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16() - { - } // ~Inst_VOP3__V_CMPX_U_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F16 class methods --- - - Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F16 - - Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16() - { - } // ~Inst_VOP3__V_CMPX_NGE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F16 class methods --- - - Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F16 - - Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16() - { - } // ~Inst_VOP3__V_CMPX_NLG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F16 class methods --- - - Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F16 - - Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16() - { - } // ~Inst_VOP3__V_CMPX_NGT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F16 class methods --- - - Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F16 - - Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16() - { - } // ~Inst_VOP3__V_CMPX_NLE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F16 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F16 - - Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F16 class methods --- - - Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F16 - - Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16() - { - } // ~Inst_VOP3__V_CMPX_NLT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F16 class methods --- - - Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F16 - - Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() - { - } // ~Inst_VOP3__V_CMPX_TRU_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_F32 class methods --- - - Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_F_F32 - - Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() - { - } // ~Inst_VOP3__V_CMP_F_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F32 class methods --- - - Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LT_F32 - - Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() - { - } // ~Inst_VOP3__V_CMP_LT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F32 class methods --- - - Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_EQ_F32 - - Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() - { - } // ~Inst_VOP3__V_CMP_EQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F32 class methods --- - - Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LE_F32 - - Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() - { - } // ~Inst_VOP3__V_CMP_LE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F32 class methods --- - - Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GT_F32 - - Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() - { - } // ~Inst_VOP3__V_CMP_GT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F32 class methods --- - - Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LG_F32 - - Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() - { - } // ~Inst_VOP3__V_CMP_LG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F32 class methods --- - - Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GE_F32 - - Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() - { - } // ~Inst_VOP3__V_CMP_GE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_O_F32 class methods --- - - Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_O_F32 - - Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() - { - } // ~Inst_VOP3__V_CMP_O_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_U_F32 class methods --- - - Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_U_F32 - - Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() - { - } // ~Inst_VOP3__V_CMP_U_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F32 class methods --- - - Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGE_F32 - - Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() - { - } // ~Inst_VOP3__V_CMP_NGE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F32 class methods --- - - Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLG_F32 - - Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() - { - } // ~Inst_VOP3__V_CMP_NLG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
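A note on the negated ("N*") and ordered/unordered predicates implemented below: with IEEE floats, every ordered comparison involving a NaN is false, so !(S0 >= S1) is not equivalent to (S0 < S1) when either operand is NaN. The small standalone check below (illustrative only, not part of the patch) makes that concrete.

    #include <cassert>
    #include <limits>

    int main()
    {
        const float nan = std::numeric_limits<float>::quiet_NaN();
        // Ordered comparisons against NaN are always false ...
        assert((nan <  1.0f) == false);
        assert((nan >= 1.0f) == false);
        // ... so the negated form used by V_CMP_NLT / V_CMP_NGE reports
        // true for NaN inputs, unlike the corresponding ordered compare.
        assert(!(nan < 1.0f) == true);   // NLT-style result
        return 0;
    }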
- void - Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F32 class methods --- - - Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGT_F32 - - Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() - { - } // ~Inst_VOP3__V_CMP_NGT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F32 class methods --- - - Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLE_F32 - - Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() - { - } // ~Inst_VOP3__V_CMP_NLE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F32 class methods --- - - Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NEQ_F32 - - Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() - { - } // ~Inst_VOP3__V_CMP_NEQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F32 class methods --- - - Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLT_F32 - - Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() - { - } // ~Inst_VOP3__V_CMP_NLT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F32 class methods --- - - Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_TRU_F32 - - Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32() - { - } // ~Inst_VOP3__V_CMP_TRU_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F32 class methods --- - - Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F32 - - Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32() - { - } // ~Inst_VOP3__V_CMPX_F_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F32 class methods --- - - Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F32 - - Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() - { - } // ~Inst_VOP3__V_CMPX_LT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F32 class methods --- - - Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F32 - - Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() - { - } // ~Inst_VOP3__V_CMPX_EQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F32 class methods --- - - Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F32 - - Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() - { - } // ~Inst_VOP3__V_CMPX_LE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F32 class methods --- - - Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F32 - - Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() - { - } // ~Inst_VOP3__V_CMPX_GT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F32 class methods --- - - Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F32 - - Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() - { - } // ~Inst_VOP3__V_CMPX_LG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F32 class methods --- - - Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F32 - - Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() - { - } // ~Inst_VOP3__V_CMPX_GE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F32 class methods --- - - Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F32 - - Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() - { - } // ~Inst_VOP3__V_CMPX_O_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F32 class methods --- - - Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F32 - - Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() - { - } // ~Inst_VOP3__V_CMPX_U_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F32 class methods --- - - Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F32 - - Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() - { - } // ~Inst_VOP3__V_CMPX_NGE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F32 class methods --- - - Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F32 - - Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() - { - } // ~Inst_VOP3__V_CMPX_NLG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F32 class methods --- - - Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F32 - - Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() - { - } // ~Inst_VOP3__V_CMPX_NGT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F32 class methods --- - - Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F32 - - Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() - { - } // ~Inst_VOP3__V_CMPX_NLE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F32 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F32 - - Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F32 class methods --- - - Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F32 - - Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() - { - } // ~Inst_VOP3__V_CMPX_NLT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F32 class methods --- - - Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F32 - - Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() - { - } // ~Inst_VOP3__V_CMPX_TRU_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_F64 class methods --- - - Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_F_F64 - - Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() - { - } // ~Inst_VOP3__V_CMP_F_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F64 class methods --- - - Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LT_F64 - - Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() - { - } // ~Inst_VOP3__V_CMP_LT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F64 class methods --- - - Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_EQ_F64 - - Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() - { - } // ~Inst_VOP3__V_CMP_EQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F64 class methods --- - - Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LE_F64 - - Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() - { - } // ~Inst_VOP3__V_CMP_LE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F64 class methods --- - - Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GT_F64 - - Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() - { - } // ~Inst_VOP3__V_CMP_GT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F64 class methods --- - - Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LG_F64 - - Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() - { - } // ~Inst_VOP3__V_CMP_LG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F64 class methods --- - - Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GE_F64 - - Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() - { - } // ~Inst_VOP3__V_CMP_GE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_O_F64 class methods --- - - Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_O_F64 - - Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() - { - } // ~Inst_VOP3__V_CMP_O_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_U_F64 class methods --- - - Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_U_F64 - - Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() - { - } // ~Inst_VOP3__V_CMP_U_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F64 class methods --- - - Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGE_F64 - - Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() - { - } // ~Inst_VOP3__V_CMP_NGE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F64 class methods --- - - Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLG_F64 - - Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() - { - } // ~Inst_VOP3__V_CMP_NLG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F64 class methods --- - - Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGT_F64 - - Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() - { - } // ~Inst_VOP3__V_CMP_NGT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F64 class methods --- - - Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLE_F64 - - Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() - { - } // ~Inst_VOP3__V_CMP_NLE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F64 class methods --- - - Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NEQ_F64 - - Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() - { - } // ~Inst_VOP3__V_CMP_NEQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F64 class methods --- - - Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLT_F64 - - Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() - { - } // ~Inst_VOP3__V_CMP_NLT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F64 class methods --- - - Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_TRU_F64 - - Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() - { - } // ~Inst_VOP3__V_CMP_TRU_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F64 class methods --- - - Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F64 - - Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() - { - } // ~Inst_VOP3__V_CMPX_F_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F64 class methods --- - - Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F64 - - Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() - { - } // ~Inst_VOP3__V_CMPX_LT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F64 class methods --- - - Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F64 - - Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() - { - } // ~Inst_VOP3__V_CMPX_EQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F64 class methods --- - - Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F64 - - Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() - { - } // ~Inst_VOP3__V_CMPX_LE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F64 class methods --- - - Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F64 - - Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() - { - } // ~Inst_VOP3__V_CMPX_GT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F64 class methods --- - - Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F64 - - Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() - { - } // ~Inst_VOP3__V_CMPX_LG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F64 class methods --- - - Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F64 - - Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() - { - } // ~Inst_VOP3__V_CMPX_GE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F64 class methods --- - - Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F64 - - Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() - { - } // ~Inst_VOP3__V_CMPX_O_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F64 class methods --- - - Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F64 - - Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() - { - } // ~Inst_VOP3__V_CMPX_U_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F64 class methods --- - - Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F64 - - Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() - { - } // ~Inst_VOP3__V_CMPX_NGE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F64 class methods --- - - Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F64 - - Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() - { - } // ~Inst_VOP3__V_CMPX_NLG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F64 class methods --- - - Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F64 - - Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() - { - } // ~Inst_VOP3__V_CMPX_NGT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F64 class methods --- - - Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F64 - - Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() - { - } // ~Inst_VOP3__V_CMPX_NLE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F64 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F64 - - Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F64 class methods --- - - Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F64 - - Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() - { - } // ~Inst_VOP3__V_CMPX_NLT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F64 class methods --- - - Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F64 - - Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() - { - } // ~Inst_VOP3__V_CMPX_TRU_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I16 class methods --- - - Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I16 - - Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() - { - } // ~Inst_VOP3__V_CMP_F_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I16 class methods --- - - Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I16 - - Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() - { - } // ~Inst_VOP3__V_CMP_LT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I16 class methods --- - - Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I16 - - Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() - { - } // ~Inst_VOP3__V_CMP_EQ_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I16 class methods --- - - Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I16 - - Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() - { - } // ~Inst_VOP3__V_CMP_LE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I16 class methods --- - - Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I16 - - Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() - { - } // ~Inst_VOP3__V_CMP_GT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I16 class methods --- - - Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I16 - - Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() - { - } // ~Inst_VOP3__V_CMP_NE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I16 class methods --- - - Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I16 - - Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() - { - } // ~Inst_VOP3__V_CMP_GE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I16 class methods --- - - Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I16 - - Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() - { - } // ~Inst_VOP3__V_CMP_T_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U16 class methods --- - - Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U16 - - Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() - { - } // ~Inst_VOP3__V_CMP_F_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U16 class methods --- - - Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U16 - - Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() - { - } // ~Inst_VOP3__V_CMP_LT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U16 class methods --- - - Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U16 - - Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() - { - } // ~Inst_VOP3__V_CMP_EQ_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U16 class methods --- - - Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U16 - - Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() - { - } // ~Inst_VOP3__V_CMP_LE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U16 class methods --- - - Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U16 - - Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() - { - } // ~Inst_VOP3__V_CMP_GT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U16 class methods --- - - Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U16 - - Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() - { - } // ~Inst_VOP3__V_CMP_NE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U16 class methods --- - - Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U16 - - Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() - { - } // ~Inst_VOP3__V_CMP_GE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U16 class methods --- - - Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U16 - - Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() - { - } // ~Inst_VOP3__V_CMP_T_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I16 class methods --- - - Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I16 - - Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() - { - } // ~Inst_VOP3__V_CMPX_F_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I16 class methods --- - - Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I16 - - Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() - { - } // ~Inst_VOP3__V_CMPX_LT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I16 class methods --- - - Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I16 - - Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() - { - } // ~Inst_VOP3__V_CMPX_EQ_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I16 class methods --- - - Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I16 - - Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() - { - } // ~Inst_VOP3__V_CMPX_LE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I16 class methods --- - - Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I16 - - Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() - { - } // ~Inst_VOP3__V_CMPX_GT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I16 class methods --- - - Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I16 - - Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16() - { - } // ~Inst_VOP3__V_CMPX_NE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I16 class methods --- - - Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I16 - - Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16() - { - } // ~Inst_VOP3__V_CMPX_GE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I16 class methods --- - - Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I16 - - Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16() - { - } // ~Inst_VOP3__V_CMPX_T_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U16 class methods --- - - Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U16 - - Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16() - { - } // ~Inst_VOP3__V_CMPX_F_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U16 class methods --- - - Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U16 - - Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16() - { - } // ~Inst_VOP3__V_CMPX_LT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U16 class methods --- - - Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U16 - - Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16() - { - } // ~Inst_VOP3__V_CMPX_EQ_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U16 class methods --- - - Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U16 - - Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16() - { - } // ~Inst_VOP3__V_CMPX_LE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U16 class methods --- - - Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U16 - - Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16() - { - } // ~Inst_VOP3__V_CMPX_GT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U16 class methods --- - - Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U16 - - Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16() - { - } // ~Inst_VOP3__V_CMPX_NE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U16 class methods --- - - Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U16 - - Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16() - { - } // ~Inst_VOP3__V_CMPX_GE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U16 class methods --- - - Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U16 - - Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16() - { - } // ~Inst_VOP3__V_CMPX_T_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I32 class methods --- - - Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I32 - - Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32() - { - } // ~Inst_VOP3__V_CMP_F_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I32 class methods --- - - Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I32 - - Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32() - { - } // ~Inst_VOP3__V_CMP_LT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I32 class methods --- - - Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I32 - - Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32() - { - } // ~Inst_VOP3__V_CMP_EQ_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I32 class methods --- - - Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I32 - - Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32() - { - } // ~Inst_VOP3__V_CMP_LE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I32 class methods --- - - Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I32 - - Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32() - { - } // ~Inst_VOP3__V_CMP_GT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I32 class methods --- - - Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I32 - - Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32() - { - } // ~Inst_VOP3__V_CMP_NE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I32 class methods --- - - Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I32 - - Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32() - { - } // ~Inst_VOP3__V_CMP_GE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I32 class methods --- - - Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I32 - - Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32() - { - } // ~Inst_VOP3__V_CMP_T_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U32 class methods --- - - Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U32 - - Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32() - { - } // ~Inst_VOP3__V_CMP_F_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U32 class methods --- - - Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U32 - - Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32() - { - } // ~Inst_VOP3__V_CMP_LT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U32 class methods --- - - Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U32 - - Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32() - { - } // ~Inst_VOP3__V_CMP_EQ_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U32 class methods --- - - Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U32 - - Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32() - { - } // ~Inst_VOP3__V_CMP_LE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U32 class methods --- - - Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U32 - - Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32() - { - } // ~Inst_VOP3__V_CMP_GT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U32 class methods --- - - Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U32 - - Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32() - { - } // ~Inst_VOP3__V_CMP_NE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U32 class methods --- - - Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U32 - - Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32() - { - } // ~Inst_VOP3__V_CMP_GE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U32 class methods --- - - Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U32 - - Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32() - { - } // ~Inst_VOP3__V_CMP_T_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I32 class methods --- - - Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I32 - - Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32() - { - } // ~Inst_VOP3__V_CMPX_F_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I32 class methods --- - - Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I32 - - Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32() - { - } // ~Inst_VOP3__V_CMPX_LT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I32 class methods --- - - Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I32 - - Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32() - { - } // ~Inst_VOP3__V_CMPX_EQ_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I32 class methods --- - - Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I32 - - Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32() - { - } // ~Inst_VOP3__V_CMPX_LE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I32 class methods --- - - Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I32 - - Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32() - { - } // ~Inst_VOP3__V_CMPX_GT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I32 class methods --- - - Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I32 - - Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32() - { - } // ~Inst_VOP3__V_CMPX_NE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I32 class methods --- - - Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I32 - - Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32() - { - } // ~Inst_VOP3__V_CMPX_GE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I32 class methods --- - - Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I32 - - Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32() - { - } // ~Inst_VOP3__V_CMPX_T_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U32 class methods --- - - Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U32 - - Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() - { - } // ~Inst_VOP3__V_CMPX_F_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U32 class methods --- - - Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U32 - - Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() - { - } // ~Inst_VOP3__V_CMPX_LT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U32 class methods --- - - Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U32 - - Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() - { - } // ~Inst_VOP3__V_CMPX_EQ_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U32 class methods --- - - Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U32 - - Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() - { - } // ~Inst_VOP3__V_CMPX_LE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U32 class methods --- - - Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U32 - - Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() - { - } // ~Inst_VOP3__V_CMPX_GT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U32 class methods --- - - Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U32 - - Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() - { - } // ~Inst_VOP3__V_CMPX_NE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U32 class methods --- - - Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U32 - - Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() - { - } // ~Inst_VOP3__V_CMPX_GE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U32 class methods --- - - Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U32 - - Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() - { - } // ~Inst_VOP3__V_CMPX_T_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I64 class methods --- - - Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I64 - - Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() - { - } // ~Inst_VOP3__V_CMP_F_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I64 class methods --- - - Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I64 - - Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() - { - } // ~Inst_VOP3__V_CMP_LT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I64 class methods --- - - Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I64 - - Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() - { - } // ~Inst_VOP3__V_CMP_EQ_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I64 class methods --- - - Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I64 - - Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() - { - } // ~Inst_VOP3__V_CMP_LE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I64 class methods --- - - Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I64 - - Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() - { - } // ~Inst_VOP3__V_CMP_GT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I64 class methods --- - - Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I64 - - Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() - { - } // ~Inst_VOP3__V_CMP_NE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I64 class methods --- - - Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I64 - - Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() - { - } // ~Inst_VOP3__V_CMP_GE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I64 class methods --- - - Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I64 - - Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() - { - } // ~Inst_VOP3__V_CMP_T_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U64 class methods --- - - Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U64 - - Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() - { - } // ~Inst_VOP3__V_CMP_F_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U64 class methods --- - - Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U64 - - Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() - { - } // ~Inst_VOP3__V_CMP_LT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U64 class methods --- - - Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U64 - - Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() - { - } // ~Inst_VOP3__V_CMP_EQ_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U64 class methods --- - - Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U64 - - Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() - { - } // ~Inst_VOP3__V_CMP_LE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U64 class methods --- - - Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U64 - - Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() - { - } // ~Inst_VOP3__V_CMP_GT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U64 class methods --- - - Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U64 - - Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() - { - } // ~Inst_VOP3__V_CMP_NE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U64 class methods --- - - Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U64 - - Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() - { - } // ~Inst_VOP3__V_CMP_GE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U64 class methods --- - - Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U64 - - Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() - { - } // ~Inst_VOP3__V_CMP_T_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I64 class methods --- - - Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I64 - - Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() - { - } // ~Inst_VOP3__V_CMPX_F_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I64 class methods --- - - Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I64 - - Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() - { - } // ~Inst_VOP3__V_CMPX_LT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I64 class methods --- - - Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I64 - - Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() - { - } // ~Inst_VOP3__V_CMPX_EQ_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I64 class methods --- - - Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I64 - - Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() - { - } // ~Inst_VOP3__V_CMPX_LE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I64 class methods --- - - Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I64 - - Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() - { - } // ~Inst_VOP3__V_CMPX_GT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I64 class methods --- - - Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I64 - - Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() - { - } // ~Inst_VOP3__V_CMPX_NE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I64 class methods --- - - Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I64 - - Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() - { - } // ~Inst_VOP3__V_CMPX_GE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I64 class methods --- - - Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I64 - - Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() - { - } // ~Inst_VOP3__V_CMPX_T_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U64 class methods --- - - Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U64 - - Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() - { - } // ~Inst_VOP3__V_CMPX_F_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U64 class methods --- - - Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U64 - - Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() - { - } // ~Inst_VOP3__V_CMPX_LT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U64 class methods --- - - Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U64 - - Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() - { - } // ~Inst_VOP3__V_CMPX_EQ_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U64 class methods --- - - Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U64 - - Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() - { - } // ~Inst_VOP3__V_CMPX_LE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U64 class methods --- - - Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U64 - - Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() - { - } // ~Inst_VOP3__V_CMPX_GT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U64 class methods --- - - Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U64 - - Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() - { - } // ~Inst_VOP3__V_CMPX_NE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U64 class methods --- - - Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U64 - - Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() - { - } // ~Inst_VOP3__V_CMPX_GE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U64 class methods --- - - Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U64 - - Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() - { - } // ~Inst_VOP3__V_CMPX_T_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CNDMASK_B32 class methods --- - - Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cndmask_b32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - } // Inst_VOP3__V_CNDMASK_B32 - - Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() - { - } // ~Inst_VOP3__V_CNDMASK_B32 - - // --- description from .arch file --- - // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC - // as a scalar GPR in S2. 
- void - Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(vcc.rawData(), lane) - ? src1[lane] : src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_F32 class methods --- - - Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_ADD_F32 - - Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() - { - } // ~Inst_VOP3__V_ADD_F32 - - // --- description from .arch file --- - // D.f = S0.f + S1.f. - void - Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_F32 class methods --- - - Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUB_F32 - - Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() - { - } // ~Inst_VOP3__V_SUB_F32 - - // --- description from .arch file --- - // D.f = S0.f - S1.f. - // SQ translates to V_ADD_F32. 
- void - Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_F32 class methods --- - - Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUBREV_F32 - - Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() - { - } // ~Inst_VOP3__V_SUBREV_F32 - - // --- description from .arch file --- - // D.f = S1.f - S0.f. - // SQ translates to V_ADD_F32. - void - Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods --- - - Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MUL_LEGACY_F32 - - Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32() - { - } // ~Inst_VOP3__V_MUL_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
- void - Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_F32 class methods --- - - Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MUL_F32 - - Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32() - { - } // ~Inst_VOP3__V_MUL_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f. 
- void - Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_I32_I24 class methods --- - - Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_I32_I24 - - Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24() - { - } // ~Inst_VOP3__V_MUL_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0]. 
- void - Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods --- - - Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_I32_I24 - - Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() - { - } // ~Inst_VOP3__V_MUL_HI_I32_I24 - - // --- description from .arch file --- - // D.i = (S0.i[23:0] * S1.i[23:0])>>32. - void - Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 tmp_src0 - = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); - VecElemI64 tmp_src1 - = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); - - vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_U32_U24 class methods --- - - Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_U32_U24 - - Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() - { - } // ~Inst_VOP3__V_MUL_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0]. 
- void - Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods --- - - Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_U32_U24 - - Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() - { - } // ~Inst_VOP3__V_MUL_HI_U32_U24 - - // --- description from .arch file --- - // D.i = (S0.u[23:0] * S1.u[23:0])>>32. - void - Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_F32 class methods --- - - Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MIN_F32 - - Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() - { - } // ~Inst_VOP3__V_MIN_F32 - - // --- description from .arch file --- - // D.f = (S0.f < S1.f ? S0.f : S1.f). 
- void - Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_F32 class methods --- - - Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MAX_F32 - - Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() - { - } // ~Inst_VOP3__V_MAX_F32 - - // --- description from .arch file --- - // D.f = (S0.f >= S1.f ? S0.f : S1.f). - void - Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_I32 class methods --- - - Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_I32 - - Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() - { - } // ~Inst_VOP3__V_MIN_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i). - void - Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_I32 class methods --- - - Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_I32 - - Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() - { - } // ~Inst_VOP3__V_MAX_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_U32 class methods --- - - Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_U32 - - Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() - { - } // ~Inst_VOP3__V_MIN_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u). - void - Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_U32 class methods --- - - Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_U32 - - Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() - { - } // ~Inst_VOP3__V_MAX_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u). - void - Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHRREV_B32 class methods --- - - Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshrrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B32 - - Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() - { - } // ~Inst_VOP3__V_LSHRREV_B32 - - // --- description from .arch file --- - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I32 class methods --- - - Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I32 - - Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() - { - } // ~Inst_VOP3__V_ASHRREV_I32 - - // --- description from .arch file --- - // D.i = signext(S1.i) >> S0.i[4:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. - void - Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B32 class methods --- - - Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B32 - - Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() - { - } // ~Inst_VOP3__V_LSHLREV_B32 - - // --- description from .arch file --- - // D.u = S1.u << S0.u[4:0]. - // SQ translates this to an internal SP opcode. 
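The "rev" shifts in this group reverse the usual operand roles: S1 supplies the value and S0 the shift count, of which only the low five bits are used (the bits(src0, 4, 0) calls above and in the v_lshlrev_b32 body that follows). A plain C++ stand-in with the gem5 bits() helper replaced by an explicit mask; the function names are illustrative:

#include <cstdint>

uint32_t v_lshrrev_b32(uint32_t s0, uint32_t s1) { return s1 >> (s0 & 0x1f); }
uint32_t v_lshlrev_b32(uint32_t s0, uint32_t s1) { return s1 << (s0 & 0x1f); }
// Right-shifting a negative int gives the arithmetic shift the spec asks for
// on the usual two's-complement targets (guaranteed only from C++20 on).
int32_t  v_ashrrev_i32(uint32_t s0, int32_t s1)  { return s1 >> (s0 & 0x1f); }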
- void - Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_AND_B32 class methods --- - - Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_and_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_AND_B32 - - Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() - { - } // ~Inst_VOP3__V_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u. - // Input and output modifiers not supported. - void - Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] & src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_OR_B32 class methods --- - - Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_OR_B32 - - Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() - { - } // ~Inst_VOP3__V_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u. - // Input and output modifiers not supported. - void - Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_OR3_B32 class methods --- - - Inst_VOP3__V_OR3_B32::Inst_VOP3__V_OR3_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_or3_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_OR3_B32 - - Inst_VOP3__V_OR3_B32::~Inst_VOP3__V_OR3_B32() - { - } // ~Inst_VOP3__V_OR3_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u | S2.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_OR3_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane] | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_XOR_B32 class methods --- - - Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_xor_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_XOR_B32 - - Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() - { - } // ~Inst_VOP3__V_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u. - // Input and output modifiers not supported. - void - Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] ^ src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAC_F32 class methods --- - - Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mac_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAC); - } // Inst_VOP3__V_MAC_F32 - - Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32() - { - } // ~Inst_VOP3__V_MAC_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + D.f. - // SQ translates to V_MAD_F32. 
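The v_mac_f32 execute body that follows reads the destination register before the lane loop and uses it as the accumulator, with std::fma supplying the single-rounding multiply-add. A per-lane sketch of the same pattern (name illustrative):

#include <cmath>

// D = fma(S0, S1, D): one rounding step, as in the std::fma call below.
float mac_f32_lane(float s0, float s1, float d_in)
{
    return std::fma(s0, s1, d_in);
}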
- void - Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vdst.read(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_CO_U32 class methods --- - - Inst_VOP3__V_ADD_CO_U32::Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_add_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_ADD_CO_U32 - - Inst_VOP3__V_ADD_CO_U32::~Inst_VOP3__V_ADD_CO_U32() - { - } // ~Inst_VOP3__V_ADD_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED - // --- overflow or carry-out for V_ADDC_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP3__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_SUB_CO_U32 class methods --- - - Inst_VOP3__V_SUB_CO_U32::Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_sub_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_SUB_CO_U32 - - Inst_VOP3__V_SUB_CO_U32::~Inst_VOP3__V_SUB_CO_U32() - { - } // ~Inst_VOP3__V_SUB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP3__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - vcc.setBit(lane, src1[lane] > src0[lane] ? 
1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_CO_U32 class methods --- - - Inst_VOP3__V_SUBREV_CO_U32::Inst_VOP3__V_SUBREV_CO_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_SUBREV_CO_U32 - - Inst_VOP3__V_SUBREV_CO_U32::~Inst_VOP3__V_SUBREV_CO_U32() - { - } // ~Inst_VOP3__V_SUBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - // SQ translates this to V_SUB_U32 with reversed operands. - void - Inst_VOP3__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_ADDC_CO_U32 class methods --- - - Inst_VOP3__V_ADDC_CO_U32::Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_addc_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_ADDC_CO_U32 - - Inst_VOP3__V_ADDC_CO_U32::~Inst_VOP3__V_ADDC_CO_U32() - { - } // ~Inst_VOP3__V_ADDC_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + VCC[threadId]; - // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) - // is an UNSIGNED overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. - void - Inst_VOP3__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] - + bits(vcc.rawData(), lane); - sdst.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] - + (VecElemU64)bits(vcc.rawData(), lane)) - >= 0x100000000 ? 
1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_SUBB_CO_U32 class methods --- - - Inst_VOP3__V_SUBB_CO_U32::Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subb_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_SUBB_CO_U32 - - Inst_VOP3__V_SUBB_CO_U32::~Inst_VOP3__V_SUBB_CO_U32() - { - } // ~Inst_VOP3__V_SUBB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // --- overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // --- source comes from the SGPR-pair at S2.u. - void - Inst_VOP3__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane] - - bits(vcc.rawData(), lane); - sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods --- - - Inst_VOP3__V_SUBBREV_CO_U32::Inst_VOP3__V_SUBBREV_CO_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subbrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_SUBBREV_CO_U32 - - Inst_VOP3__V_SUBBREV_CO_U32::~Inst_VOP3__V_SUBBREV_CO_U32() - { - } // ~Inst_VOP3__V_SUBBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. - void - Inst_VOP3__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane] - - bits(vcc.rawData(), lane); - sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 
1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_F16 class methods --- - - Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_ADD_F16 - - Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() - { - } // ~Inst_VOP3__V_ADD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SUB_F16 class methods --- - - Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUB_F16 - - Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() - { - } // ~Inst_VOP3__V_SUB_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 - S1.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. - void - Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SUBREV_F16 class methods --- - - Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUBREV_F16 - - Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16() - { - } // ~Inst_VOP3__V_SUBREV_F16 - - // --- description from .arch file --- - // D.f16 = S1.f16 - S0.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. - void - Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MUL_F16 class methods --- - - Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_MUL_F16 - - Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16() - { - } // ~Inst_VOP3__V_MUL_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAC_F16 class methods --- - - Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mac_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(MAC); - } // Inst_VOP3__V_MAC_F16 - - Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16() - { - } // ~Inst_VOP3__V_MAC_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + D.f16. - // Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. - void - Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_U16 class methods --- - - Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_U16 - - Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16() - { - } // ~Inst_VOP3__V_ADD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 + S1.u16. - // Supports saturation (unsigned 16-bit integer domain). 
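The carry-propagating adds and subtracts above (v_add_co_u32 through v_subbrev_co_u32) compute the per-lane carry or borrow by widening to 64 bits and comparing against 0x100000000ULL, i.e. 2^32 (the code's threshold, not the 0x800000000ULL that appears in the quoted .arch text), then record it as one bit of the chosen SGPR pair. A stand-alone sketch of that arithmetic; the struct and function names are illustrative, not gem5's:

#include <cstdint>

struct AddCarry { uint32_t d; bool carry; };

// D = S0 + S1 (+ carry-in); carry-out when the widened sum reaches 2^32.
AddCarry addCo(uint32_t s0, uint32_t s1, bool carryIn = false)
{
    uint64_t wide = uint64_t(s0) + uint64_t(s1) + (carryIn ? 1 : 0);
    return { uint32_t(wide), wide >= 0x100000000ULL };
}

struct SubBorrow { uint32_t d; bool borrow; };

// D = S0 - S1 (- borrow-in); borrow-out when the subtrahend exceeds S0.
SubBorrow subCo(uint32_t s0, uint32_t s1, bool borrowIn = false)
{
    return { uint32_t(s0 - s1 - (borrowIn ? 1 : 0)),
             uint64_t(s1) + (borrowIn ? 1 : 0) > s0 };
}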
- void - Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_U16 class methods --- - - Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUB_U16 - - Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16() - { - } // ~Inst_VOP3__V_SUB_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 - S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_U16 class methods --- - - Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUBREV_U16 - - Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16() - { - } // ~Inst_VOP3__V_SUBREV_U16 - - // --- description from .arch file --- - // D.u16 = S1.u16 - S0.u16. - // Supports saturation (unsigned 16-bit integer domain). - // SQ translates this to V_SUB_U16 with reversed operands. - void - Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LO_U16 class methods --- - - Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_lo_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_LO_U16 - - Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16() - { - } // ~Inst_VOP3__V_MUL_LO_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16. 
- // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B16 class methods --- - - Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B16 - - Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16() - { - } // ~Inst_VOP3__V_LSHLREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. - // SQ translates this to an internal SP opcode. - void - Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHRREV_B16 class methods --- - - Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshrrev_b16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B16 - - Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16() - { - } // ~Inst_VOP3__V_LSHRREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. 
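The 16-bit integer forms above work directly on uint16_t lane values, so the arithmetic wraps modulo 2^16; the saturation mentioned in the .arch text (presumably the VOP3 clamp) is not modeled in these lane bodies. The 16-bit shifts likewise take only the low four bits of S0 as the count. A compact sketch with illustrative names:

#include <cstdint>

uint16_t v_add_u16(uint16_t s0, uint16_t s1)     { return uint16_t(s0 + s1); }
uint16_t v_sub_u16(uint16_t s0, uint16_t s1)     { return uint16_t(s0 - s1); }
uint16_t v_mul_lo_u16(uint16_t s0, uint16_t s1)  { return uint16_t(s0 * s1); }
// Shift count is S0[3:0], i.e. bits(src0, 3, 0) in the lane bodies above and below.
uint16_t v_lshlrev_b16(uint16_t s0, uint16_t s1) { return uint16_t(s1 << (s0 & 0xf)); }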
- void - Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I16 class methods --- - - Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I16 - - Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16() - { - } // ~Inst_VOP3__V_ASHRREV_I16 - - // --- description from .arch file --- - // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. - void - Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_F16 class methods --- - - Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_MAX_F16 - - Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16() - { - } // ~Inst_VOP3__V_MAX_F16 - - // --- description from .arch file --- - // D.f16 = max(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MIN_F16 class methods --- - - Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_MIN_F16 - - Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16() - { - } // ~Inst_VOP3__V_MIN_F16 - - // --- description from .arch file --- - // D.f16 = min(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAX_U16 class methods --- - - Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_U16 - - Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16() - { - } // ~Inst_VOP3__V_MAX_U16 - - // --- description from .arch file --- - // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_I16 class methods --- - - Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_I16 - - Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16() - { - } // ~Inst_VOP3__V_MAX_I16 - - // --- description from .arch file --- - // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_U16 class methods --- - - Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_U16 - - Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16() - { - } // ~Inst_VOP3__V_MIN_U16 - - // --- description from .arch file --- - // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). - void - Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_I16 class methods --- - - Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_I16 - - Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16() - { - } // ~Inst_VOP3__V_MIN_I16 - - // --- description from .arch file --- - // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). 
- void - Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LDEXP_F16 class methods --- - - Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ldexp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_LDEXP_F16 - - Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16() - { - } // ~Inst_VOP3__V_LDEXP_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * (2 ** S1.i16). - void - Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_U32 class methods --- - - Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_U32 - - Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32() - { - } // ~Inst_VOP3__V_ADD_U32 - - // --- description from .arch file --- - // D.u32 = S0.u32 + S1.u32. - void - Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_U32 class methods --- - - Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUB_U32 - - Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32() - { - } // ~Inst_VOP3__V_SUB_U32 - - // --- description from .arch file --- - // D.u32 = S0.u32 - S1.u32. 
- void - Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_U32 class methods --- - - Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUBREV_U32 - - Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32() - { - } // ~Inst_VOP3__V_SUBREV_U32 - - // --- description from .arch file --- - // D.u32 = S1.u32 - S0.u32. - void - Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_NOP class methods --- - - Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_nop", false) - { - setFlag(Nop); - setFlag(ALU); - } // Inst_VOP3__V_NOP - - Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP() - { - } // ~Inst_VOP3__V_NOP - - // --- description from .arch file --- - // Do nothing. - void - Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_VOP3__V_MOV_B32 class methods --- - - Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mov_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MOV_B32 - - Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32() - { - } // ~Inst_VOP3__V_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_I32_F64 class methods --- - - Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_I32_F64 - - Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64() - { - } // ~Inst_VOP3__V_CVT_I32_F64 - - // --- description from .arch file --- - // D.i = (int)S0.d. - // Out-of-range floating point values (including infinity) saturate. 
NaN is - // --- converted to 0. - void - Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_I32 class methods --- - - Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_I32 - - Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32() - { - } // ~Inst_VOP3__V_CVT_F64_I32 - - // --- description from .arch file --- - // D.d = (double)S0.i. - void - Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_I32 class methods --- - - Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_I32 - - Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32() - { - } // ~Inst_VOP3__V_CVT_F32_I32 - - // --- description from .arch file --- - // D.f = (float)S0.i. - void - Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - VecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_U32 class methods --- - - Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_U32 - - Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() - { - } // ~Inst_VOP3__V_CVT_F32_U32 - - // --- description from .arch file --- - // D.f = (float)S0.u. 
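The saturating float-to-integer conversion above (v_cvt_i32_f64 here; the f32-source variants below follow the same pattern) maps NaN to 0 and clamps out-of-range values, including infinities, to INT_MIN/INT_MAX, detecting the out-of-range case from the exponent returned by std::frexp. A stand-alone sketch of the same saturating behaviour using explicit range checks instead (function name illustrative):

#include <climits>
#include <cmath>
#include <cstdint>

int32_t cvt_i32_f64_saturating(double x)
{
    if (std::isnan(x)) {
        return 0;                  // NaN converts to 0
    }
    if (x >= 2147483648.0) {       // 2^31 and above (incl. +inf): clamp high
        return INT_MAX;
    }
    if (x < -2147483648.0) {       // below -2^31 (incl. -inf): clamp low
        return INT_MIN;
    }
    return int32_t(x);             // in-range values truncate toward zero
}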
- void - Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_U32_F32 class methods --- - - Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_U32_F32 - - Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() - { - } // ~Inst_VOP3__V_CVT_U32_F32 - - // --- description from .arch file --- - // D.u = (unsigned)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. - void - Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_I32_F32 class methods --- - - Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_I32_F32 - - Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32() - { - } // ~Inst_VOP3__V_CVT_I32_F32 - - // --- description from .arch file --- - // D.i = (int)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MOV_FED_B32 class methods --- - - Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mov_fed_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MOV_FED_B32 - - Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32() - { - } // ~Inst_VOP3__V_MOV_FED_B32 - - // --- description from .arch file --- - // D.u = S0.u; - // Introduce EDC double error upon write to dest vgpr without causing an - // --- exception. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_F32 class methods --- - - Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F16_F32 - - Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32() - { - } // ~Inst_VOP3__V_CVT_F16_F32 - - // --- description from .arch file --- - // D.f16 = flt32_to_flt16(S0.f). - // Supports input modifiers and creates FP16 denormals when appropriate. - void - Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F32_F16 class methods --- - - Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_F16 - - Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16() - { - } // ~Inst_VOP3__V_CVT_F32_F16 - - // --- description from .arch file --- - // D.f = flt16_to_flt32(S0.f16). - // FP16 denormal inputs are always accepted. - void - Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods --- - - Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_RPI_I32_F32 - - Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32() - { - } // ~Inst_VOP3__V_CVT_RPI_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f + 0.5). 
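v_cvt_rpi_i32_f32 rounds halfway cases toward plus infinity by adding 0.5 before flooring, and v_cvt_flr_i32_f32 (below) simply floors; unlike the plain v_cvt_i32_f32 above, these lane bodies apply no NaN or range clamping. A two-line sketch (names illustrative):

#include <cmath>
#include <cstdint>

int32_t cvt_rpi_i32_f32(float x) { return int32_t(std::floor(x + 0.5)); } // e.g. -1.5 -> -1
int32_t cvt_flr_i32_f32(float x) { return int32_t(std::floor(x)); }       // e.g. -1.5 -> -2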
- void - Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods --- - - Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_FLR_I32_F32 - - Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32() - { - } // ~Inst_VOP3__V_CVT_FLR_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f). - void - Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods --- - - Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_OFF_F32_I4 - - Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4() - { - } // ~Inst_VOP3__V_CVT_OFF_F32_I4 - - // --- description from .arch file --- - // 4-bit signed int to 32-bit float. Used for interpolation in shader. - void - Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) - { - // Could not parse sq_uc.arch desc field - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F32_F64 class methods --- - - Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F32_F64 - - Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64() - { - } // ~Inst_VOP3__V_CVT_F32_F64 - - // --- description from .arch file --- - // D.f = (float)S0.d. 
- void - Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_F32 class methods --- - - Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_F32 - - Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32() - { - } // ~Inst_VOP3__V_CVT_F64_F32 - - // --- description from .arch file --- - // D.d = (double)S0.f. - void - Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE0 - - Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE0 - - // --- description from .arch file --- - // D.f = (float)(S0.u[7:0]). - void - Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 7, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE1 - - Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE1 - - // --- description from .arch file --- - // D.f = (float)(S0.u[15:8]). 
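The v_cvt_f32_ubyteN lanes (ubyte0 above, ubyte1 through ubyte3 below) each pick one byte of the 32-bit source with the gem5 bits(src, hi, lo) helper and convert it to float. An equivalent shift-and-mask stand-in (names illustrative):

#include <cstdint>

// byteIdx 0..3 selects S0[7:0], S0[15:8], S0[23:16] or S0[31:24].
float cvt_f32_ubyte(uint32_t src, int byteIdx)
{
    return float((src >> (8 * byteIdx)) & 0xffu);
}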
- void - Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 15, 8); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE2 - - Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE2 - - // --- description from .arch file --- - // D.f = (float)(S0.u[23:16]). - void - Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 23, 16); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE3 - - Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE3 - - // --- description from .arch file --- - // D.f = (float)(S0.u[31:24]). - void - Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 31, 24); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_U32_F64 class methods --- - - Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_U32_F64 - - Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64() - { - } // ~Inst_VOP3__V_CVT_U32_F64 - - // --- description from .arch file --- - // D.u = (unsigned)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_U32 class methods --- - - Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_U32 - - Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32() - { - } // ~Inst_VOP3__V_CVT_F64_U32 - - // --- description from .arch file --- - // D.d = (double)S0.u. - void - Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_TRUNC_F64 class methods --- - - Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_TRUNC_F64 - - Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64() - { - } // ~Inst_VOP3__V_TRUNC_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d), return integer part of S0.d. - void - Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CEIL_F64 class methods --- - - Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ceil_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CEIL_F64 - - Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64() - { - } // ~Inst_VOP3__V_CEIL_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. 
- void - Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RNDNE_F64 class methods --- - - Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rndne_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RNDNE_F64 - - Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64() - { - } // ~Inst_VOP3__V_RNDNE_F64 - - // --- description from .arch file --- - // D.d = round_nearest_even(S0.d). - void - Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FLOOR_F64 class methods --- - - Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_floor_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FLOOR_F64 - - Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64() - { - } // ~Inst_VOP3__V_FLOOR_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. - void - Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FRACT_F32 class methods --- - - Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FRACT_F32 - - Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() - { - } // ~Inst_VOP3__V_FRACT_F32 - - // --- description from .arch file --- - // D.f = S0.f - floor(S0.f). 
- void - Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_TRUNC_F32 class methods --- - - Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_TRUNC_F32 - - Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() - { - } // ~Inst_VOP3__V_TRUNC_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f), return integer part of S0.f. - void - Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CEIL_F32 class methods --- - - Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ceil_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CEIL_F32 - - Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32() - { - } // ~Inst_VOP3__V_CEIL_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. - void - Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RNDNE_F32 class methods --- - - Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rndne_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RNDNE_F32 - - Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32() - { - } // ~Inst_VOP3__V_RNDNE_F32 - - // --- description from .arch file --- - // D.f = round_nearest_even(S0.f). 
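For reference, round_nearest_even in the description above is IEEE round-half-to-even (banker's rounding), which the implementation below delegates to the gem5 helper roundNearestEven rather than std::round (std::round sends halfway cases away from zero). A minimal standalone sketch of that intended behaviour, using only the C++ standard library (illustrative only, not gem5 code and not part of this patch):

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    // Round to nearest, ties to even. std::nearbyint honours the current
    // floating-point rounding mode; FE_TONEAREST is the IEEE-754
    // round-half-to-even mode.
    static float roundNearestEvenSketch(float x)
    {
        std::fesetround(FE_TONEAREST);
        return std::nearbyint(x);
    }

    int main()
    {
        // 0.5 -> 0, 1.5 -> 2, 2.5 -> 2 (contrast with std::round: 1, 2, 3).
        std::printf("%g %g %g\n",
                    roundNearestEvenSketch(0.5f),
                    roundNearestEvenSketch(1.5f),
                    roundNearestEvenSketch(2.5f));
        return 0;
    }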
- void - Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FLOOR_F32 class methods --- - - Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_floor_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FLOOR_F32 - - Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32() - { - } // ~Inst_VOP3__V_FLOOR_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. - void - Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_EXP_F32 class methods --- - - Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_EXP_F32 - - Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32() - { - } // ~Inst_VOP3__V_EXP_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f). - void - Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LOG_F32 class methods --- - - Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_F32 - - Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32() - { - } // ~Inst_VOP3__V_LOG_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm. 
- void - Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RCP_F32 class methods --- - - Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RCP_F32 - - Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32() - { - } // ~Inst_VOP3__V_RCP_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error. - void - Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods --- - - Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RCP_IFLAG_F32 - - Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32() - { - } // ~Inst_VOP3__V_RCP_IFLAG_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise - // --- integer DIV_BY_ZERO exception but cannot raise floating-point - // --- exceptions. - void - Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RSQ_F32 class methods --- - - Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rsq_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RSQ_F32 - - Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32() - { - } // ~Inst_VOP3__V_RSQ_F32 - - // --- description from .arch file --- - // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules. 
- void - Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RCP_F64 class methods --- - - Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RCP_F64 - - Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64() - { - } // ~Inst_VOP3__V_RCP_F64 - - // --- description from .arch file --- - // D.d = 1.0 / S0.d. - void - Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = 1.0 / src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RSQ_F64 class methods --- - - Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rsq_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RSQ_F64 - - Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64() - { - } // ~Inst_VOP3__V_RSQ_F64 - - // --- description from .arch file --- - // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32. - void - Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) { - vdst[lane] = 0.0; - } else if (std::signbit(src[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SQRT_F32 class methods --- - - Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sqrt_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SQRT_F32 - - Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32() - { - } // ~Inst_VOP3__V_SQRT_F32 - - // --- description from .arch file --- - // D.f = sqrt(S0.f). 
- void - Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SQRT_F64 class methods --- - - Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sqrt_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_SQRT_F64 - - Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64() - { - } // ~Inst_VOP3__V_SQRT_F64 - - // --- description from .arch file --- - // D.d = sqrt(S0.d). - void - Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SIN_F32 class methods --- - - Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sin_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SIN_F32 - - Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32() - { - } // ~Inst_VOP3__V_SIN_F32 - - // --- description from .arch file --- - // D.f = sin(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 0.0. - void - Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sin(src[lane] * 2 * pi.rawData()); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_COS_F32 class methods --- - - Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cos_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_COS_F32 - - Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32() - { - } // ~Inst_VOP3__V_COS_F32 - - // --- description from .arch file --- - // D.f = cos(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 1.0. 
- void - Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::cos(src[lane] * 2 * pi.rawData()); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_NOT_B32 class methods --- - - Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_not_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_NOT_B32 - - Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32() - { - } // ~Inst_VOP3__V_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u. - // Input and output modifiers not supported. - void - Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ~src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BFREV_B32 class methods --- - - Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFREV_B32 - - Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32() - { - } // ~Inst_VOP3__V_BFREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31], bitfield reverse. - // Input and output modifiers not supported. - void - Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = reverseBits(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FFBH_U32 class methods --- - - Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ffbh_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBH_U32 - - Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32() - { - } // ~Inst_VOP3__V_FFBH_U32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from MSB; - // D.u = 0xffffffff if S0.u == 0. 
- void - Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOneMsb(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FFBL_B32 class methods --- - - Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ffbl_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBL_B32 - - Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32() - { - } // ~Inst_VOP3__V_FFBL_B32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from LSB; - // D.u = 0xffffffff if S0.u == 0. - void - Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOne(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FFBH_I32 class methods --- - - Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ffbh_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBH_I32 - - Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32() - { - } // ~Inst_VOP3__V_FFBH_I32 - - // --- description from .arch file --- - // D.u = position of first bit different from sign bit in S0.i from MSB; - // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. - void - Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = firstOppositeSignBit(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods --- - - Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FREXP_EXP_I32_F64 - - Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64() - { - } // ~Inst_VOP3__V_FREXP_EXP_I32_F64 - - // --- description from .arch file --- - // See V_FREXP_EXP_I32_F32. 
- void - Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_MANT_F64 class methods --- - - Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FREXP_MANT_F64 - - Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64() - { - } // ~Inst_VOP3__V_FREXP_MANT_F64 - - // --- description from .arch file --- - // See V_FREXP_MANT_F32. - void - Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FRACT_F64 class methods --- - - Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FRACT_F64 - - Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() - { - } // ~Inst_VOP3__V_FRACT_F64 - - // --- description from .arch file --- - // See V_FRACT_F32. - void - Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods --- - - Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FREXP_EXP_I32_F32 - - Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() - { - } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) then D.i = 0; - // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). - // Returns exponent of single precision float input, such that S0.f = - // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns - // the significand. 
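The relation in the description above is what std::frexp computes for finite non-zero inputs: a significand in [0.5, 1.0) and an integer exponent with S0.f = significand * 2^exponent, i.e. the unbiased IEEE exponent plus one, which is why the implementation below can simply report frexp's exponent. A small self-contained illustration (plain host C++, not gem5 code):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        float x = 96.0f;                   // 96 = 0.75 * 2^7
        int exp = 0;
        float mant = std::frexp(x, &exp);  // mant in [0.5, 1.0) for normals

        // Recombining with ldexp reproduces the input exactly.
        std::printf("x=%g mant=%g exp=%d recombined=%g\n",
                    x, mant, exp, std::ldexp(mant, exp));
        return 0;
    }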
- void - Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane])|| std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_MANT_F32 class methods --- - - Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FREXP_MANT_F32 - - Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32() - { - } // ~Inst_VOP3__V_FREXP_MANT_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) then D.f = S0.f; - // else D.f = Mantissa(S0.f). - // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary - // --- significand of single precision float input, such that S0.f = - // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which - // --- returns integer exponent. - void - Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CLREXCP class methods --- - - Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_clrexcp", false) - { - } // Inst_VOP3__V_CLREXCP - - Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP() - { - } // ~Inst_VOP3__V_CLREXCP - - // --- description from .arch file --- - // Clear wave's exception state in SIMD (SP). - void - Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_U16 class methods --- - - Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_F16_U16 - - Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16() - { - } // ~Inst_VOP3__V_CVT_F16_U16 - - // --- description from .arch file --- - // D.f16 = uint16_to_flt16(S.u16). - // Supports denormals, rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_I16 class methods --- - - Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_F16_I16 - - Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16() - { - } // ~Inst_VOP3__V_CVT_F16_I16 - - // --- description from .arch file --- - // D.f16 = int16_to_flt16(S.i16). 
- // Supports denormals, rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_U16_F16 class methods --- - - Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_U16_F16 - - Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16() - { - } // ~Inst_VOP3__V_CVT_U16_F16 - - // --- description from .arch file --- - // D.u16 = flt16_to_uint16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_I16_F16 class methods --- - - Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_I16_F16 - - Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16() - { - } // ~Inst_VOP3__V_CVT_I16_F16 - - // --- description from .arch file --- - // D.i16 = flt16_to_int16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RCP_F16 class methods --- - - Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RCP_F16 - - Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16() - { - } // ~Inst_VOP3__V_RCP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecip(S0.f16). - void - Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SQRT_F16 class methods --- - - Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sqrt_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SQRT_F16 - - Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16() - { - } // ~Inst_VOP3__V_SQRT_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateSqrt(S0.f16). - void - Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RSQ_F16 class methods --- - - Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rsq_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RSQ_F16 - - Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16() - { - } // ~Inst_VOP3__V_RSQ_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecipSqrt(S0.f16). - void - Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_LOG_F16 class methods --- - - Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_LOG_F16 - - Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16() - { - } // ~Inst_VOP3__V_LOG_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 0.0f; - // else - // D.f16 = ApproximateLog2(S0.f16). 
- void - Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_EXP_F16 class methods --- - - Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_EXP_F16 - - Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16() - { - } // ~Inst_VOP3__V_EXP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 0.0f) - // D.f16 = 1.0f; - // else - // D.f16 = Approximate2ToX(S0.f16). - void - Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FREXP_MANT_F16 class methods --- - - Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FREXP_MANT_F16 - - Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16() - { - } // ~Inst_VOP3__V_FREXP_MANT_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.f16 = S0.f16; - // else - // D.f16 = mantissa(S0.f16). - // Result range is (-1.0,-0.5][0.5,1.0). - // C math library frexp function. - // Returns binary significand of half precision float input, such that the - // original single float = significand * (2 ** exponent). - void - Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods --- - - Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FREXP_EXP_I16_F16 - - Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16() - { - } // ~Inst_VOP3__V_FREXP_EXP_I16_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.i16 = 0; - // else - // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). - // C math library frexp function. - // Returns exponent of half precision float input, such that the - // original single float = significand * (2 ** exponent). - void - Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FLOOR_F16 class methods --- - - Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_floor_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FLOOR_F16 - - Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16() - { - } // ~Inst_VOP3__V_FLOOR_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. - void - Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CEIL_F16 class methods --- - - Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ceil_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CEIL_F16 - - Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16() - { - } // ~Inst_VOP3__V_CEIL_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
- void - Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_TRUNC_F16 class methods --- - - Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_TRUNC_F16 - - Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16() - { - } // ~Inst_VOP3__V_TRUNC_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16). - // Round-to-zero semantics. - void - Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RNDNE_F16 class methods --- - - Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rndne_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RNDNE_F16 - - Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16() - { - } // ~Inst_VOP3__V_RNDNE_F16 - - // --- description from .arch file --- - // D.f16 = FLOOR(S0.f16 + 0.5f); - // if(floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. - // Round-to-nearest-even semantics. - void - Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FRACT_F16 class methods --- - - Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FRACT_F16 - - Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16() - { - } // ~Inst_VOP3__V_FRACT_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + -floor(S0.f16). - void - Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SIN_F16 class methods --- - - Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sin_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SIN_F16 - - Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16() - { - } // ~Inst_VOP3__V_SIN_F16 - - // --- description from .arch file --- - // D.f16 = sin(S0.f16 * 2 * PI). - void - Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_COS_F16 class methods --- - - Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cos_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_COS_F16 - - Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16() - { - } // ~Inst_VOP3__V_COS_F16 - - // --- description from .arch file --- - // D.f16 = cos(S0.f16 * 2 * PI). - void - Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods --- - - Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_EXP_LEGACY_F32 - - Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32() - { - } // ~Inst_VOP3__V_EXP_LEGACY_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f) with legacy semantics. 
- void - Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods --- - - Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_LEGACY_F32 - - Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32() - { - } // ~Inst_VOP3__V_LOG_LEGACY_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. - void - Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods --- - - Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP3__V_MAD_LEGACY_F32 - - Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32() - { - } // ~Inst_VOP3__V_MAD_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0). - void - Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_F32 class methods --- - - Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP3__V_MAD_F32 - - Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() - { - } // ~Inst_VOP3__V_MAD_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f. 
- void - Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I32_I24 class methods --- - - Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_i32_i24", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I32_I24 - - Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24() - { - } // ~Inst_VOP3__V_MAD_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0] + S2.i. - void - Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_U32_U24 class methods --- - - Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_u32_u24", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U32_U24 - - Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24() - { - } // ~Inst_VOP3__V_MAD_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0] + S2.u. 
- void - Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0) - + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CUBEID_F32 class methods --- - - Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubeid_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBEID_F32 - - Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32() - { - } // ~Inst_VOP3__V_CUBEID_F32 - - // --- description from .arch file --- - // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in - // --- (S0.f, S1.f, S2.f). - void - Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBESC_F32 class methods --- - - Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubesc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBESC_F32 - - Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32() - { - } // ~Inst_VOP3__V_CUBESC_F32 - - // --- description from .arch file --- - // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f, - // S2.f). - void - Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBETC_F32 class methods --- - - Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubetc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBETC_F32 - - Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32() - { - } // ~Inst_VOP3__V_CUBETC_F32 - - // --- description from .arch file --- - // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f, - // S2.f). - void - Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBEMA_F32 class methods --- - - Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubema_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBEMA_F32 - - Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32() - { - } // ~Inst_VOP3__V_CUBEMA_F32 - - // --- description from .arch file --- - // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f, - // --- S2.f). 
- void
- Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_VOP3__V_BFE_U32 class methods ---
-
- Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_bfe_u32", false)
- {
- setFlag(ALU);
- } // Inst_VOP3__V_BFE_U32
-
- Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32()
- {
- } // ~Inst_VOP3__V_BFE_U32
-
- // --- description from .arch file ---
- // D.u = (S0.u>>S1.u[4:0]) & ((1<<S2.u[4:0])-1); bitfield extract,
- // S0=data, S1=field_offset, S2=field_width.
- void
- Inst_VOP3__V_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
- ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
- ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
- ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
- VecOperandU32 vdst(gpuDynInst, instData.VDST);
-
- src0.readSrc();
- src1.readSrc();
- src2.readSrc();
-
- /**
- * input modifiers are supported by FP operations only
- */
- assert(!(instData.ABS & 0x1));
- assert(!(instData.ABS & 0x2));
- assert(!(instData.ABS & 0x4));
- assert(!(extData.NEG & 0x1));
- assert(!(extData.NEG & 0x2));
- assert(!(extData.NEG & 0x4));
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (wf->execMask(lane)) {
- vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
- & ((1 << bits(src2[lane], 4, 0)) - 1);
- }
- }
-
- vdst.write();
- } // execute
- // --- Inst_VOP3__V_BFE_I32 class methods ---
-
- Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_bfe_i32", false)
- {
- setFlag(ALU);
- } // Inst_VOP3__V_BFE_I32
-
- Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32()
- {
- } // ~Inst_VOP3__V_BFE_I32
-
- // --- description from .arch file ---
- // D.i = (S0.i>>S1.u[4:0]) & ((1<<S2.u[4:0])-1); bitfield extract,
- // S0=data, S1=field_offset, S2=field_width.
- void
- Inst_VOP3__V_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
- ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
- ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
- ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
- VecOperandI32 vdst(gpuDynInst, instData.VDST);
-
- src0.readSrc();
- src1.readSrc();
- src2.readSrc();
-
- /**
- * input modifiers are supported by FP operations only
- */
- assert(!(instData.ABS & 0x1));
- assert(!(instData.ABS & 0x2));
- assert(!(instData.ABS & 0x4));
- assert(!(extData.NEG & 0x1));
- assert(!(extData.NEG & 0x2));
- assert(!(extData.NEG & 0x4));
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (wf->execMask(lane)) {
- vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
- & ((1 << bits(src2[lane], 4, 0)) - 1);
-
- // Above extracted a signed int of size src2 bits which needs
- // to be signed-extended. Check if the MSB of our src2-bit
- // integer is 1, and sign extend it if it is.
- if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) {
- vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0);
- }
- }
- }
-
- vdst.write();
- } // execute
- // --- Inst_VOP3__V_BFI_B32 class methods ---
-
- Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_bfi_b32", false)
- {
- setFlag(ALU);
- } // Inst_VOP3__V_BFI_B32
-
- Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32()
- {
- } // ~Inst_VOP3__V_BFI_B32
-
- // --- description from .arch file ---
- // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
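As an aside, the three bitfield operations here (v_bfe_u32 and v_bfe_i32 above, v_bfi_b32 implemented below) reduce to a few lines of ordinary integer arithmetic. The sketch below restates them with plain uint32_t/int32_t values instead of the gem5 operand classes; the helper names are made up for illustration and this is not part of the patch:

    #include <cstdint>
    #include <cstdio>

    // v_bfe_u32: extract 'width' bits of 'data' starting at bit 'offset'.
    static uint32_t bfeU32(uint32_t data, uint32_t offset, uint32_t width)
    {
        offset &= 0x1f;
        width &= 0x1f;
        return (data >> offset) & ((1u << width) - 1u);
    }

    // v_bfe_i32: same extract, then sign-extend from bit (width - 1).
    static int32_t bfeI32(int32_t data, uint32_t offset, uint32_t width)
    {
        offset &= 0x1f;
        width &= 0x1f;
        uint32_t field = (static_cast<uint32_t>(data) >> offset)
                       & ((1u << width) - 1u);
        if (width && (field >> (width - 1))) {
            field |= 0xffffffffu << width;   // replicate the sign bit
        }
        return static_cast<int32_t>(field);
    }

    // v_bfi_b32: bitfield insert, take src1 where the mask is set, else src2.
    static uint32_t bfiB32(uint32_t mask, uint32_t src1, uint32_t src2)
    {
        return (mask & src1) | (~mask & src2);
    }

    int main()
    {
        std::printf("%u %d 0x%08x\n",
                    bfeU32(0xabcd1234u, 8, 8),        // 0x12 == 18
                    bfeI32(0x0000f000, 12, 4),        // field 0xf -> -1
                    bfiB32(0x0000ffffu, 0x1111aaaau, 0x2222bbbbu)); // 0x2222aaaa
        return 0;
    }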
- void - Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane] - & src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F32 class methods --- - - Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F32 - - Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32() - { - } // ~Inst_VOP3__V_FMA_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f. - void - Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F64 class methods --- - - Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f64", false) - { - setFlag(ALU); - setFlag(F64); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F64 - - Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64() - { - } // ~Inst_VOP3__V_FMA_F64 - - // --- description from .arch file --- - // D.d = S0.d * S1.d + S2.d. 
- void - Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LERP_U8 class methods --- - - Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lerp_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LERP_U8 - - Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8() - { - } // ~Inst_VOP3__V_LERP_U8 - - // --- description from .arch file --- - // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24 - // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16; - // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8; - // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1). - // Unsigned 8-bit pixel average on packed unsigned bytes (linear - // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up, - // --- otherwise 0.5 truncates. - void - Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ((bits(src0[lane], 31, 24) - + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1) - << 24; - vdst[lane] += ((bits(src0[lane], 23, 16) - + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1) - << 16; - vdst[lane] += ((bits(src0[lane], 15, 8) - + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1) - << 8; - vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0) - + bits(src2[lane], 0)) >> 1); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ALIGNBIT_B32 class methods --- - - Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_alignbit_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ALIGNBIT_B32 - - Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32() - { - } // ~Inst_VOP3__V_ALIGNBIT_B32 - - // --- description from .arch file --- - // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff. 
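v_alignbit_b32 is easiest to read as a 64-bit funnel shift; here is a small scalar sketch under that reading. The name alignbit_b32 is invented for illustration and nothing below is part of the patch.

#include <cassert>
#include <cstdint>

// Scalar model of the {S0,S1} >> S2.u[4:0] operation described above: S0 forms
// the upper 32 bits and S1 the lower 32 bits of a 64-bit value, which is then
// shifted right by S2[4:0]; only the low 32 bits of the result are kept.
uint32_t alignbit_b32(uint32_t s0, uint32_t s1, uint32_t s2)
{
    uint64_t concat = (static_cast<uint64_t>(s0) << 32) | s1;
    return static_cast<uint32_t>(concat >> (s2 & 0x1f));
}

int main()
{
    assert(alignbit_b32(0x11223344, 0xaabbccdd, 0) == 0xaabbccdd); // shift 0: S1
    assert(alignbit_b32(0x11223344, 0xaabbccdd, 8) == 0x44aabbcc); // low byte of S0 shifts in
    return 0;
}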
- void - Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) - | (VecElemU64)src1[lane]); - vdst[lane] = (VecElemU32)((src_0_1 - >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods --- - - Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_alignbyte_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ALIGNBYTE_B32 - - Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32() - { - } // ~Inst_VOP3__V_ALIGNBYTE_B32 - - // --- description from .arch file --- - // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff. - void - Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) - | (VecElemU64)src1[lane]); - vdst[lane] = (VecElemU32)((src_0_1 - >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0))) - & 0xffffffff); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_F32 class methods --- - - Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MIN3_F32 - - Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32() - { - } // ~Inst_VOP3__V_MIN3_F32 - - // --- description from .arch file --- - // D.f = min(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]); - vdst[lane] = std::fmin(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_I32 class methods --- - - Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN3_I32 - - Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32() - { - } // ~Inst_VOP3__V_MIN3_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i, S2.i). - void - Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]); - vdst[lane] = std::min(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_U32 class methods --- - - Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN3_U32 - - Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32() - { - } // ~Inst_VOP3__V_MIN3_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]); - vdst[lane] = std::min(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_F32 class methods --- - - Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MAX3_F32 - - Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32() - { - } // ~Inst_VOP3__V_MAX3_F32 - - // --- description from .arch file --- - // D.f = max(S0.f, S1.f, S2.f). - void - Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]); - vdst[lane] = std::fmax(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_I32 class methods --- - - Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_I32 - - Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() - { - } // ~Inst_VOP3__V_MAX3_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_U32 class methods --- - - Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_U32 - - Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() - { - } // ~Inst_VOP3__V_MAX3_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u, S2.u). - void - Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_F32 class methods --- - - Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MED3_F32 - - Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() - { - } // ~Inst_VOP3__V_MED3_F32 - - // --- description from .arch file --- - // D.f = median(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_I32 class methods --- - - Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MED3_I32 - - Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32() - { - } // ~Inst_VOP3__V_MED3_I32 - - // --- description from .arch file --- - // D.i = median(S0.i, S1.i, S2.i). - void - Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_U32 class methods --- - - Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MED3_U32 - - Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32() - { - } // ~Inst_VOP3__V_MED3_U32 - - // --- description from .arch file --- - // D.u = median(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U8 class methods --- - - Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U8 - - Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() - { - } // ~Inst_VOP3__V_SAD_U8 - - // --- description from .arch file --- - // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) + - // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u. - // Sum of absolute differences with accumulation, overflow into upper bits - // is allowed. - void - Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(bits(src0[lane], 31, 24) - - bits(src1[lane], 31, 24)) - + std::abs(bits(src0[lane], 23, 16) - - bits(src1[lane], 23, 16)) - + std::abs(bits(src0[lane], 15, 8) - - bits(src1[lane], 15, 8)) - + std::abs(bits(src0[lane], 7, 0) - - bits(src1[lane], 7, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_HI_U8 class methods --- - - Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_hi_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_HI_U8 - - Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8() - { - } // ~Inst_VOP3__V_SAD_HI_U8 - - // --- description from .arch file --- - // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u. - // Sum of absolute differences with accumulation, overflow is lost. 
- void - Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (((bits(src0[lane], 31, 24) - - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16) - - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8) - - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0) - - bits(src1[lane], 7, 0))) << 16) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U16 class methods --- - - Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U16 - - Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() - { - } // ~Inst_VOP3__V_SAD_U16 - - // --- description from .arch file --- - // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) - // + S2.u. - // Word SAD with accumulation. - void - Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(bits(src0[lane], 31, 16) - - bits(src1[lane], 31, 16)) - + std::abs(bits(src0[lane], 15, 0) - - bits(src1[lane], 15, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U32 class methods --- - - Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U32 - - Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32() - { - } // ~Inst_VOP3__V_SAD_U32 - - // --- description from .arch file --- - // D.u = abs(S0.i - S1.i) + S2.u. - // Dword SAD with accumulation. 
- void - Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane]; - } // if - } // for - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods --- - - Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PK_U8_F32 - - Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32() - { - } // ~Inst_VOP3__V_CVT_PK_U8_F32 - - // --- description from .arch file --- - // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0])) - // | (S2.u & ~(0xff << (8 * S1.u[1:0]))). - // Convert floating point value S0 to 8-bit unsigned integer and pack the - // result into byte S1 of dword S2. - void - Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (((VecElemU8)src0[lane] & 0xff) - << (8 * bits(src1[lane], 1, 0))) - | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0)))); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FIXUP_F32 class methods --- - - Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fixup_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_DIV_FIXUP_F32 - - Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32() - { - } // ~Inst_VOP3__V_DIV_FIXUP_F32 - - // --- description from .arch file --- - // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator, - // s2.f = Numerator. This opcode generates exceptions resulting from the - // division operation. 
-    void
-    Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
-        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
-        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
-        VecOperandF32 vdst(gpuDynInst, instData.VDST);
-
-        src0.readSrc();
-        src1.readSrc();
-        src2.readSrc();
-
-        if (instData.ABS & 0x1) {
-            src0.absModifier();
-        }
-
-        if (instData.ABS & 0x2) {
-            src1.absModifier();
-        }
-
-        if (instData.ABS & 0x4) {
-            src2.absModifier();
-        }
-
-        if (extData.NEG & 0x1) {
-            src0.negModifier();
-        }
-
-        if (extData.NEG & 0x2) {
-            src1.negModifier();
-        }
-
-        if (extData.NEG & 0x4) {
-            src2.negModifier();
-        }
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                if (std::fpclassify(src1[lane]) == FP_ZERO) {
-                    if (std::signbit(src1[lane])) {
-                        vdst[lane] = -INFINITY;
-                    } else {
-                        vdst[lane] = +INFINITY;
-                    }
-                } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
-                    vdst[lane] = NAN;
-                } else if (std::isinf(src1[lane])) {
-                    if (std::signbit(src1[lane])) {
-                        vdst[lane] = -INFINITY;
-                    } else {
-                        vdst[lane] = +INFINITY;
-                    }
-                } else {
-                    vdst[lane] = src2[lane] / src1[lane];
-                }
-            }
-        }
-
-        vdst.write();
-    } // execute
-    // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---
-
-    Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3A *iFmt)
-        : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
-    {
-        setFlag(ALU);
-        setFlag(F64);
-    } // Inst_VOP3__V_DIV_FIXUP_F64
-
-    Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64()
-    {
-    } // ~Inst_VOP3__V_DIV_FIXUP_F64
-
-    // --- description from .arch file ---
-    // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
-    // s2.d = Numerator. This opcode generates exceptions resulting from the
-    // division operation.
-    void
-    Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
-        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
-        ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
-        VecOperandF64 vdst(gpuDynInst, instData.VDST);
-
-        src0.readSrc();
-        src1.readSrc();
-        src2.readSrc();
-
-        if (instData.ABS & 0x1) {
-            src0.absModifier();
-        }
-
-        if (instData.ABS & 0x2) {
-            src1.absModifier();
-        }
-
-        if (instData.ABS & 0x4) {
-            src2.absModifier();
-        }
-
-        if (extData.NEG & 0x1) {
-            src0.negModifier();
-        }
-
-        if (extData.NEG & 0x2) {
-            src1.negModifier();
-        }
-
-        if (extData.NEG & 0x4) {
-            src2.negModifier();
-        }
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                int sign_out = std::signbit(src1[lane])
-                    ^ std::signbit(src2[lane]);
-                int exp1(0);
-                int exp2(0);
-                std::frexp(src1[lane], &exp1);
-                std::frexp(src2[lane], &exp2);
-
-                if (std::isnan(src1[lane]) || std::isnan(src2[lane])) {
-                    vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN();
-                } else if (std::fpclassify(src1[lane]) == FP_ZERO
-                    && std::fpclassify(src2[lane]) == FP_ZERO) {
-                    vdst[lane]
-                        = std::numeric_limits<VecElemF64>::signaling_NaN();
-                } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) {
-                    vdst[lane]
-                        = std::numeric_limits<VecElemF64>::signaling_NaN();
-                } else if (std::fpclassify(src1[lane]) == FP_ZERO
-                    || std::isinf(src2[lane])) {
-                    vdst[lane] = sign_out ? -INFINITY : +INFINITY;
-                } else if (std::isinf(src1[lane])
-                    || std::fpclassify(src2[lane]) == FP_ZERO) {
-                    vdst[lane] = sign_out ?
-0.0 : +0.0; - } else if (exp2 - exp1 < -1075) { - vdst[lane] = src0[lane]; - } else if (exp1 == 2047) { - vdst[lane] = src0[lane]; - } else { - vdst[lane] = sign_out ? -std::fabs(src0[lane]) - : std::fabs(src0[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_SCALE_F32 class methods --- - - Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_div_scale_f32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F32); - } // Inst_VOP3__V_DIV_SCALE_F32 - - Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32() - { - } // ~Inst_VOP3__V_DIV_SCALE_F32 - - // --- description from .arch file --- - // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f = - // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. - void - Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane]; - vcc.setBit(lane, 0); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- - - Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_div_scale_f64") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F64); - } // Inst_VOP3__V_DIV_SCALE_F64 - - Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64() - { - } // ~Inst_VOP3__V_DIV_SCALE_F64 - - // --- description from .arch file --- - // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d = - // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. 
- void - Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp1(0); - int exp2(0); - std::frexp(src1[lane], &exp1); - std::frexp(src2[lane], &exp2); - vcc.setBit(lane, 0); - - if (std::fpclassify(src1[lane]) == FP_ZERO - || std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (exp2 - exp1 >= 768) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], 128); - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL - && std::fpclassify(src2[lane] / src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], -128); - } else if (std::fpclassify(src2[lane] / src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src2[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (exp2 <= 53) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FMAS_F32 class methods --- - - Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fmas_f32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F32); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F32 - - Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() - { - } // ~Inst_VOP3__V_DIV_FMAS_F32 - - // --- description from .arch file --- - // D.f = Special case divide FMA with scale and flags(s0.f = Quotient, - // s1.f = Denominator, s2.f = Numerator) - void - Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - //vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- - - Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fmas_f64", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F64); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F64 - - 
Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64() - { - } // ~Inst_VOP3__V_DIV_FMAS_F64 - - // --- description from .arch file --- - // D.d = Special case divide FMA with scale and flags(s0.d = Quotient, - // s1.d = Denominator, s2.d = Numerator) - void - Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - vcc.read(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(vcc.rawData(), lane)) { - vdst[lane] = std::pow(2, 64) - * std::fma(src0[lane], src1[lane], src2[lane]); - } else { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MSAD_U8 class methods --- - - Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_msad_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MSAD_U8 - - Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8() - { - } // ~Inst_VOP3__V_MSAD_U8 - - // --- description from .arch file --- - // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u). - void - Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods --- - - Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_QSAD_PK_U16_U8 - - Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8() - { - } // ~Inst_VOP3__V_QSAD_PK_U16_U8 - - // --- description from .arch file --- - // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], - // S1.u[31:0], S2.u[63:0]) - void - Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods --- - - Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MQSAD_PK_U16_U8 - - Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8() - { - } // ~Inst_VOP3__V_MQSAD_PK_U16_U8 - - // --- description from .arch file --- - // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], - // --- S1.u[31:0], S2.u[63:0]) - void - Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MQSAD_U32_U8 class methods --- - - Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MQSAD_U32_U8 - - Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8() - { - } // ~Inst_VOP3__V_MQSAD_U32_U8 - - // --- description from .arch file --- - // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0], - // --- S1.u[31:0], S2.u[127:0]) - void - 
Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAD_U64_U32 class methods --- - - Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_mad_u64_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U64_U32 - - Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32() - { - } // ~Inst_VOP3__V_MAD_U64_U32 - - // --- description from .arch file --- - // {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64. - void - Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - vdst.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], - src2[lane])); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I64_I32 class methods --- - - Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_mad_i64_i32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I64_I32 - - Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32() - { - } // ~Inst_VOP3__V_MAD_I64_I32 - - // --- description from .arch file --- - // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64. - void - Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandI64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], - src2[lane])); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_XAD_U32 class methods --- - - Inst_VOP3__V_XAD_U32::Inst_VOP3__V_XAD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_xad_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_XAD_U32 - - Inst_VOP3__V_XAD_U32::~Inst_VOP3__V_XAD_U32() - { - } // ~Inst_VOP3__V_XAD_U32 - - // --- description from .arch file --- - // D.u32 = (S0.u32 ^ S1.u32) + S2.u32. 
- void - Inst_VOP3__V_XAD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHL_ADD_U32 class methods --- - - Inst_VOP3__V_LSHL_ADD_U32::Inst_VOP3__V_LSHL_ADD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshl_add_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHL_ADD_U32 - - Inst_VOP3__V_LSHL_ADD_U32::~Inst_VOP3__V_LSHL_ADD_U32() - { - } // ~Inst_VOP3__V_LSHL_ADD_U32 - - // --- description from .arch file --- - // D.u = (S0.u << S1.u[4:0]) + S2.u. - void - Inst_VOP3__V_LSHL_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) - + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_LSHL_U32 class methods --- - - Inst_VOP3__V_ADD_LSHL_U32::Inst_VOP3__V_ADD_LSHL_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_lshl_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_LSHL_U32 - - Inst_VOP3__V_ADD_LSHL_U32::~Inst_VOP3__V_ADD_LSHL_U32() - { - } // ~Inst_VOP3__V_ADD_LSHL_U32 - - // --- description from .arch file --- - // D.u = (S0.u + S1.u) << S2.u[4:0]. 
- void - Inst_VOP3__V_ADD_LSHL_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = - (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD3_U32 class methods --- - - Inst_VOP3__V_ADD3_U32::Inst_VOP3__V_ADD3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD3_U32 - - Inst_VOP3__V_ADD3_U32::~Inst_VOP3__V_ADD3_U32() - { - } // ~Inst_VOP3__V_ADD3_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + S2.u. - void - Inst_VOP3__V_ADD3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHL_OR_B32 class methods --- - - Inst_VOP3__V_LSHL_OR_B32::Inst_VOP3__V_LSHL_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshl_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHL_OR_B32 - - Inst_VOP3__V_LSHL_OR_B32::~Inst_VOP3__V_LSHL_OR_B32() - { - } // ~Inst_VOP3__V_LSHL_OR_B32 - - // --- description from .arch file --- - // D.u = (S0.u << S1.u[4:0]) | S2.u. 
- void - Inst_VOP3__V_LSHL_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) - | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_AND_OR_B32 class methods --- - - Inst_VOP3__V_AND_OR_B32::Inst_VOP3__V_AND_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_and_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_AND_OR_B32 - - Inst_VOP3__V_AND_OR_B32::~Inst_VOP3__V_AND_OR_B32() - { - } // ~Inst_VOP3__V_AND_OR_B32 - - // --- description from .arch file --- - // D.u = (S0.u & S1.u) | S2.u. - // Input and output modifiers not supported. - void - Inst_VOP3__V_AND_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] & src1[lane]) | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_F16 class methods --- - - Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP3__V_MAD_F16 - - Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16() - { - } // ~Inst_VOP3__V_MAD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + S2.f16. - // Supports round mode, exception flags, saturation. - void - Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAD_U16 class methods --- - - Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_u16", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U16 - - Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16() - { - } // ~Inst_VOP3__V_MAD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16 + S2.u16. - // Supports saturation (unsigned 16-bit integer domain). 
- void - Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I16 class methods --- - - Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_i16", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I16 - - Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() - { - } // ~Inst_VOP3__V_MAD_I16 - - // --- description from .arch file --- - // D.i16 = S0.i16 * S1.i16 + S2.i16. - // Supports saturation (signed 16-bit integer domain). - void - Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI16 src2(gpuDynInst, extData.SRC2); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_PERM_B32 class methods --- - - Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_perm_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_PERM_B32 - - Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32() - { - } // ~Inst_VOP3__V_PERM_B32 - - // --- description from .arch file --- - // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]); - // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]); - // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]); - // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]); - // byte permute(byte in[8], byte sel) { - // if(sel>=13) then return 0xff; - // elsif(sel==12) then return 0x00; - // elsif(sel==11) then return in[7][7] * 0xff; - // elsif(sel==10) then return in[5][7] * 0xff; - // elsif(sel==9) then return in[3][7] * 0xff; - // elsif(sel==8) then return in[1][7] * 0xff; - // else return in[sel]; - // } - // Byte permute. 
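The byte-permute rules above are easier to follow with a scalar model. The sketch below assumes, as the execute() implementation that follows appears to, that S0 supplies pool bytes 4-7 and S1 bytes 0-3; permute_byte and perm_b32 are invented names for this note only and nothing here is part of the patch.

#include <cassert>
#include <cstdint>

// One byte is chosen per selector byte, following the .arch rules above:
// sel >= 13 -> 0xff, sel == 12 -> 0x00, sel 8..11 -> replicate the sign bit of
// pool byte 1/3/5/7, otherwise return pool byte 'sel'.
uint8_t permute_byte(uint64_t pool, uint8_t sel)
{
    if (sel >= 13)
        return 0xff;
    if (sel == 12)
        return 0x00;
    if (sel >= 8)
        return (pool >> (8 * (2 * (sel - 8) + 1) + 7)) & 1 ? 0xff : 0x00;
    return (pool >> (8 * sel)) & 0xff;
}

// Apply the rule to each of the four selector bytes held in S2.
uint32_t perm_b32(uint32_t s0, uint32_t s1, uint32_t s2)
{
    uint64_t pool = (static_cast<uint64_t>(s0) << 32) | s1;
    uint32_t d = 0;
    for (int i = 0; i < 4; ++i) {
        uint8_t sel = (s2 >> (8 * i)) & 0xff;
        d |= static_cast<uint32_t>(permute_byte(pool, sel)) << (8 * i);
    }
    return d;
}

int main()
{
    // Selector 0x06040200 picks pool bytes 0, 2 (from S1) and 4, 6 (from S0).
    assert(perm_b32(0x11223344, 0xaabbccdd, 0x06040200) == 0x2244bbdd);
    return 0;
}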
- void - Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 selector = (VecElemU64)src0[lane]; - selector = (selector << 32) | (VecElemU64)src1[lane]; - vdst[lane] = 0; - - DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 " - "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane], - src1[lane], src2[lane], vdst[lane]); - DPRINTF(VEGA, "Selector: 0x%08x \n", selector); - - for (int i = 0; i < 4 ; ++i) { - VecElemU32 permuted_val = permute(selector, 0xFF - & ((VecElemU32)src2[lane] >> (8 * i))); - vdst[lane] |= (permuted_val << (8 * i)); - } - - DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F16 class methods --- - - Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F16 - - Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16() - { - } // ~Inst_VOP3__V_FMA_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + S2.f16. - // Fused half precision multiply add. - void - Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods --- - - Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fixup_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_DIV_FIXUP_F16 - - Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16() - { - } // ~Inst_VOP3__V_DIV_FIXUP_F16 - - // --- description from .arch file --- - // sign_out = sign(S1.f16)^sign(S2.f16); - // if (S2.f16 == NAN) - // D.f16 = Quiet(S2.f16); - // else if (S1.f16 == NAN) - // D.f16 = Quiet(S1.f16); - // else if (S1.f16 == S2.f16 == 0) - // # 0/0 - // D.f16 = pele_nan(0xfe00); - // else if (abs(S1.f16) == abs(S2.f16) == +-INF) - // # inf/inf - // D.f16 = pele_nan(0xfe00); - // else if (S1.f16 ==0 || abs(S2.f16) == +-INF) - // # x/0, or inf/y - // D.f16 = sign_out ? -INF : INF; - // else if (abs(S1.f16) == +-INF || S2.f16 == 0) - // # x/inf, 0/y - // D.f16 = sign_out ? -0 : 0; - // else if ((exp(S2.f16) - exp(S1.f16)) < -150) - // D.f16 = sign_out ? -underflow : underflow; - // else if (exp(S1.f16) == 255) - // D.f16 = sign_out ? -overflow : overflow; - // else - // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16). - // Half precision division fixup. - // S0 = Quotient, S1 = Denominator, S3 = Numerator. - // Given a numerator, denominator, and quotient from a divide, this opcode - // will detect and apply special case numerics, touching up the quotient if - // necessary. This opcode also generates invalid, denorm and divide by - // zero exceptions caused by the division. 
- void - Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods --- - - Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PKACCUM_U8_F32 - - Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32() - { - } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32 - - // --- description from .arch file --- - // byte = S1.u[1:0]; bit = byte * 8; - // D.u[bit+7:bit] = flt32_to_uint8(S0.f); - // Pack converted value of S0.f into byte S1 of the destination. - // SQ translates to V_CVT_PK_U8_F32. - // Note: this opcode uses src_c to pass destination in as a source. - void - Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P1_F32 class methods --- - - Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p1_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_INTERP_P1_F32 - - Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32() - { - } // ~Inst_VOP3__V_INTERP_P1_F32 - - // --- description from .arch file --- - // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if - // D == S then data corruption will occur. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P2_F32 class methods --- - - Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p2_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_INTERP_P2_F32 - - Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32() - { - } // ~Inst_VOP3__V_INTERP_P2_F32 - - // --- description from .arch file --- - // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_MOV_F32 class methods --- - - Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_mov_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_INTERP_MOV_F32 - - Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32() - { - } // ~Inst_VOP3__V_INTERP_MOV_F32 - - // --- description from .arch file --- - // D.f = {P10,P20,P0}[S.u]; parameter load. 
- void - Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods --- - - Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_INTERP_P1LL_F16 - - Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16() - { - } // ~Inst_VOP3__V_INTERP_P1LL_F16 - - // --- description from .arch file --- - // D.f32 = P10.f16 * S0.f32 + P0.f16. - // 'LL' stands for 'two LDS arguments'. - // attr_word selects the high or low half 16 bits of each LDS dword - // accessed. - // This opcode is available for 32-bank LDS only. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods --- - - Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_INTERP_P1LV_F16 - - Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16() - { - } // ~Inst_VOP3__V_INTERP_P1LV_F16 - - // --- description from .arch file --- - // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16. - // 'LV' stands for 'One LDS and one VGPR argument'. - // S2 holds two parameters, attr_word selects the high or low word of the - // VGPR for this calculation, as well as the high or low half of the LDS - // data. - // Meant for use with 16-bank LDS. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P2_F16 class methods --- - - Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p2_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_INTERP_P2_F16 - - Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16() - { - } // ~Inst_VOP3__V_INTERP_P2_F16 - - // --- description from .arch file --- - // D.f16 = P20.f16 * S0.f32 + S2.f32. - // Final computation. attr_word selects LDS high or low 16bits. Used for - // both 16- and 32-bank LDS. - // Result is always written to the 16 LSBs of the destination VGPR. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_F64 class methods --- - - Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_ADD_F64 - - Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64() - { - } // ~Inst_VOP3__V_ADD_F64 - - // --- description from .arch file --- - // D.d = S0.d + S1.d. 
- void - Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane]) ) { - vdst[lane] = NAN; - } else if (std::isinf(src0[lane]) && - std::isinf(src1[lane])) { - if (std::signbit(src0[lane]) != - std::signbit(src1[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = src0[lane]; - } - } else if (std::isinf(src0[lane])) { - vdst[lane] = src0[lane]; - } else if (std::isinf(src1[lane])) { - vdst[lane] = src1[lane]; - } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - if (std::signbit(src0[lane]) && - std::signbit(src1[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = src1[lane]; - } - } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::signbit(src0[lane]) && - std::signbit(src1[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = src0[lane]; - } - } else { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_F64 class methods --- - - Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_MUL_F64 - - Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64() - { - } // ~Inst_VOP3__V_MUL_F64 - - // --- description from .arch file --- - // D.d = S0.d * S1.d. 
- void - Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_F64 class methods --- - - Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_MIN_F64 - - Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64() - { - } // ~Inst_VOP3__V_MIN_F64 - - // --- description from .arch file --- - // D.d = min(S0.d, S1.d). 
- void - Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_F64 class methods --- - - Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_MAX_F64 - - Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64() - { - } // ~Inst_VOP3__V_MAX_F64 - - // --- description from .arch file --- - // D.d = max(S0.d, S1.d). - void - Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LDEXP_F64 class methods --- - - Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ldexp_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_LDEXP_F64 - - Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64() - { - } // ~Inst_VOP3__V_LDEXP_F64 - - // --- description from .arch file --- - // D.d = pow(S0.d, S1.i[31:0]). 
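// Note on the wording above: ldexp scales by a power of two, so the
// operation the implementation below performs is D.d = S0.d * 2^S1.i
// rather than a general pow(). Two quick worked examples:
//   ldexp(1.5, 3)   == 1.5  * 2^3  == 12.0
//   ldexp(-6.0, -1) == -6.0 * 2^-1 == -3.0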
- void - Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { - vdst[lane] = src0[lane]; - } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - || std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::signbit(src0[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = +0.0; - } - } else { - vdst[lane] = std::ldexp(src0[lane], src1[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LO_U32 class methods --- - - Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_lo_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_LO_U32 - - Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32() - { - } // ~Inst_VOP3__V_MUL_LO_U32 - - // --- description from .arch file --- - // D.u = S0.u * S1.u. - void - Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 s0 = (VecElemI64)src0[lane]; - VecElemI64 s1 = (VecElemI64)src1[lane]; - vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_U32 class methods --- - - Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_U32 - - Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32() - { - } // ~Inst_VOP3__V_MUL_HI_U32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32. 
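// Worked example for the high-half multiply that follows: widening both
// operands to 64 bits, 0xFFFFFFFF * 0x00000010 = 0x0000000FFFFFFFF0, so
// v_mul_hi_u32 returns 0x0000000F while v_mul_lo_u32 (above) returns
// 0xFFFFFFF0.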
- void - Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 s0 = (VecElemI64)src0[lane]; - VecElemI64 s1 = (VecElemI64)src1[lane]; - vdst[lane] - = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_I32 class methods --- - - Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_I32 - - Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32() - { - } // ~Inst_VOP3__V_MUL_HI_I32 - - // --- description from .arch file --- - // D.i = (S0.i * S1.i) >> 32. - void - Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 s0 = (VecElemI64)src0[lane]; - VecElemI64 s1 = (VecElemI64)src1[lane]; - vdst[lane] - = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LDEXP_F32 class methods --- - - Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ldexp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LDEXP_F32 - - Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32() - { - } // ~Inst_VOP3__V_LDEXP_F32 - - // --- description from .arch file --- - // D.f = pow(S0.f, S1.i) - void - Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ldexp(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_READLANE_B32 class methods --- - - Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_readlane_b32", true) - { - setFlag(ALU); - setFlag(IgnoreExec); - } // Inst_VOP3__V_READLANE_B32 - - Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32() - { - } // ~Inst_VOP3__V_READLANE_B32 - - // --- description from .arch file --- 
- // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR# - // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - sdst = src0[src1.rawData() & 0x3f]; - - sdst.write(); - } // execute - // --- Inst_VOP3__V_WRITELANE_B32 class methods --- - - Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_writelane_b32", false) - { - setFlag(ALU); - setFlag(IgnoreExec); - } // Inst_VOP3__V_WRITELANE_B32 - - Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32() - { - } // ~Inst_VOP3__V_WRITELANE_B32 - - // --- description from .arch file --- - // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data - // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores - // exec mask. - // Input and output modifiers not supported; this is an untyped operation. - // SQ translates to V_MOV_B32. - void - Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.read(); - src1.read(); - vdst.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - vdst[src1.rawData() & 0x3f] = src0.rawData(); - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BCNT_U32_B32 class methods --- - - Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BCNT_U32_B32 - - Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32() - { - } // ~Inst_VOP3__V_BCNT_U32_B32 - - // --- description from .arch file --- - // D.u = CountOneBits(S0.u) + S1.u. Bit count. 
- void - Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = popCount(src0[lane]) + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods --- - - Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MBCNT_LO_U32_B32 - - Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32() - { - } // ~Inst_VOP3__V_MBCNT_LO_U32_B32 - - // --- description from .arch file --- - // ThreadMask = (1 << ThreadPosition) - 1; - // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u. - // Masked bit count, ThreadPosition is the position of this thread in the - // --- wavefront (in 0..63). - void - Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- - - Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MBCNT_HI_U32_B32 - - Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32() - { - } // ~Inst_VOP3__V_MBCNT_HI_U32_B32 - - // --- description from .arch file --- - // ThreadMask = (1 << ThreadPosition) - 1; - // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u. - // Masked bit count, ThreadPosition is the position of this thread in the - // --- wavefront (in 0..63). 
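// Illustrative, self-contained sketch (not part of this change) of the usual
// v_mbcnt_lo_u32_b32 + v_mbcnt_hi_u32_b32 idiom: each lane counts the active
// lanes below it, splitting the 64-bit mask exactly as the two opcodes do.
#include <cstdint>

uint32_t
laneRank(uint64_t execMask, int lane)  // lane in 0..63
{
    uint64_t threadMask = (1ULL << lane) - 1ULL;
    uint64_t below = execMask & threadMask;
    uint32_t lo = __builtin_popcount(uint32_t(below));        // mbcnt_lo, bits [31:0]
    uint32_t hi = __builtin_popcount(uint32_t(below >> 32));  // mbcnt_hi, bits [63:32]
    return hi + lo;  // mbcnt_hi is given mbcnt_lo's result as its S1 accumulator
}
// Example: with all 64 lanes active, lane 37 sees lo = 32, hi = 5, rank = 37.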
- void - Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B64 class methods --- - - Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B64 - - Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64() - { - } // ~Inst_VOP3__V_LSHLREV_B64 - - // --- description from .arch file --- - // D.u64 = S1.u64 << S0.u[5:0]. - // SQ translates this to an internal SP opcode. - void - Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 5, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHRREV_B64 class methods --- - - Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshrrev_b64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B64 - - Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64() - { - } // ~Inst_VOP3__V_LSHRREV_B64 - - // --- description from .arch file --- - // D.u64 = S1.u64 >> S0.u[5:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. 
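// Worked example for the two 64-bit right shifts that follow: with
// S0[5:0] = 8 and S1 = 0xFF00000000000000,
//   v_lshrrev_b64 (logical)    yields 0x00FF000000000000 (zero fill), while
//   v_ashrrev_i64 (arithmetic) yields 0xFFFF000000000000 (sign fill),
// because S1 is negative when read as a signed 64-bit value.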
- void - Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I64 class methods --- - - Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I64 - - Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64() - { - } // ~Inst_VOP3__V_ASHRREV_I64 - - // --- description from .arch file --- - // D.u64 = signext(S1.u64) >> S0.u[5:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. - void - Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src1[lane] >> bits(src0[lane], 5, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods --- - - Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trig_preop_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_TRIG_PREOP_F64 - - Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64() - { - } // ~Inst_VOP3__V_TRIG_PREOP_F64 - - // --- description from .arch file --- - // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation - // returns an aligned, double precision segment of 2/PI needed to do range - // reduction on S0.d (double-precision value). Multiple segments can be - // specified through S1.u[4:0]. Rounding is always round-to-zero. Large - // inputs (exp > 1968) are scaled to avoid loss of precision through - // denormalization. 
- void
- Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_VOP3__V_BFM_B32 class methods ---
-
- Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_bfm_b32", false)
- {
- setFlag(ALU);
- } // Inst_VOP3__V_BFM_B32
-
- Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32()
- {
- } // ~Inst_VOP3__V_BFM_B32
-
- // --- description from .arch file ---
- // D.u = ((1<<S0.u[4:0])-1) << S1.u[4:0]; bitfield modify.
- // S0 is the bitfield width and S1 is the bitfield offset.
- void
- Inst_VOP3__V_BFM_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
- ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
- ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
- VecOperandU32 vdst(gpuDynInst, instData.VDST);
-
- src0.readSrc();
- src1.readSrc();
-
- /**
- * input modifiers are supported by FP operations only
- */
- assert(!(instData.ABS & 0x1));
- assert(!(instData.ABS & 0x2));
- assert(!(instData.ABS & 0x4));
- assert(!(extData.NEG & 0x1));
- assert(!(extData.NEG & 0x2));
- assert(!(extData.NEG & 0x4));
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (wf->execMask(lane)) {
- vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1)
- << bits(src1[lane], 4, 0);
- }
- }
-
- vdst.write();
- } // execute
- // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods ---
-
- Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32(
- InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false)
- {
- setFlag(ALU);
- setFlag(F32);
- } // Inst_VOP3__V_CVT_PKNORM_I16_F32
-
- Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32()
- {
- } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32
-
- // --- description from .arch file ---
- // D = {(snorm)S1.f, (snorm)S0.f}.
- void
- Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods ---
-
- Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32(
- InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false)
- {
- setFlag(ALU);
- setFlag(F32);
- } // Inst_VOP3__V_CVT_PKNORM_U16_F32
-
- Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32()
- {
- } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32
-
- // --- description from .arch file ---
- // D = {(unorm)S1.f, (unorm)S0.f}.
- void
- Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods ---
-
- Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32(
- InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false)
- {
- setFlag(ALU);
- setFlag(F32);
- } // Inst_VOP3__V_CVT_PKRTZ_F16_F32
-
- Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32()
- {
- } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32
-
- // --- description from .arch file ---
- // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero
- // --- regardless of current round mode setting in hardware.
- // This opcode is intended for use with 16-bit compressed exports.
- // See V_CVT_F16_F32 for a version that respects the current rounding mode.
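// Illustrative sketch only (not part of this change): one way the packing
// described above could be realized. Round-toward-zero float32->float16
// amounts to truncating the mantissa; this simplified helper flushes f16
// denormals to zero and clamps finite overflow to the largest finite f16.
#include <cstdint>
#include <cstring>

static uint16_t
f32ToF16Rtz(float in)
{
    uint32_t u;
    std::memcpy(&u, &in, sizeof(u));
    uint16_t sign = (u >> 16) & 0x8000;
    uint32_t exp8 = (u >> 23) & 0xff;
    uint32_t mant = u & 0x7fffff;
    if (exp8 == 0xff)                              // Inf/NaN propagate
        return sign | 0x7c00 | (mant ? 0x200 : 0);
    int32_t exp5 = int32_t(exp8) - 127 + 15;
    if (exp5 >= 0x1f) return sign | 0x7bff;        // overflow: largest finite (RTZ)
    if (exp5 <= 0)    return sign;                 // underflow: flush to zero
    return sign | (exp5 << 10) | (mant >> 13);     // truncate mantissa (RTZ)
}

// D = {f16(S1), f16(S0)}, low half from S0, as the description states.
static uint32_t
packRtz(float s0, float s1)
{
    return uint32_t(f32ToF16Rtz(s1)) << 16 | f32ToF16Rtz(s0);
}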
- void
- Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods ---
-
- Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false)
- {
- setFlag(ALU);
- } // Inst_VOP3__V_CVT_PK_U16_U32
-
- Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32()
- {
- } // ~Inst_VOP3__V_CVT_PK_U16_U32
-
- // --- description from .arch file ---
- // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
- void
- Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods ---
-
- Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3A *iFmt)
- : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false)
- {
- setFlag(ALU);
- } // Inst_VOP3__V_CVT_PK_I16_I32
-
- Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32()
- {
- } // ~Inst_VOP3__V_CVT_PK_I16_I32
-
- // --- description from .arch file ---
- // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
- void
- Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_DS__DS_ADD_U32 class methods ---
-
- Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_add_u32")
- {
- setFlag(MemoryRef);
- setFlag(GroupSegment);
- setFlag(AtomicAdd);
- setFlag(AtomicNoReturn);
- } // Inst_DS__DS_ADD_U32
-
- Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32()
- {
- } // ~Inst_DS__DS_ADD_U32
-
- // --- description from .arch file ---
- // 32b:
- // MEM[ADDR] += DATA;
- void
- Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
-
- addr.read();
- data.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
- = data[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_ADD_U32::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0;
- Addr offset1 = instData.OFFSET1;
- Addr offset = (offset1 << 8) | offset0;
-
- initAtomicAccess(gpuDynInst, offset);
- } // initiateAcc
-
- void
- Inst_DS__DS_ADD_U32::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- } // completeAcc
- // --- Inst_DS__DS_SUB_U32 class methods ---
-
- Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_sub_u32")
- {
- } // Inst_DS__DS_SUB_U32
-
- Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32()
- {
- } // ~Inst_DS__DS_SUB_U32
-
- // --- description from .arch file ---
- // 32b:
- // tmp = MEM[ADDR];
- // MEM[ADDR] -= DATA;
- // RETURN_DATA = tmp.
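// Illustrative sketch only (not part of this change): ds_sub_u32 could mirror
// the ds_add_u32 pattern shown above, differing only in the atomic flag
// (AtomicSub is assumed to be the matching opcode flag):
//
//     setFlag(MemoryRef);
//     setFlag(GroupSegment);
//     setFlag(AtomicSub);
//     setFlag(AtomicNoReturn);
//
// with an execute() that stages the per-lane operand exactly like ds_add_u32,
//
//     (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] = data[lane];
//
// and initiateAcc()/completeAcc() identical to the ds_add_u32 versions.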
- void - Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_U32 class methods --- - - Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_u32") - { - } // Inst_DS__DS_RSUB_U32 - - Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32() - { - } // ~Inst_DS__DS_RSUB_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_U32 class methods --- - - Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_u32") - { - } // Inst_DS__DS_INC_U32 - - Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32() - { - } // ~Inst_DS__DS_INC_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_U32 class methods --- - - Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_u32") - { - } // Inst_DS__DS_DEC_U32 - - Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32() - { - } // ~Inst_DS__DS_DEC_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_I32 class methods --- - - Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_i32") - { - } // Inst_DS__DS_MIN_I32 - - Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32() - { - } // ~Inst_DS__DS_MIN_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_I32 class methods --- - - Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_i32") - { - } // Inst_DS__DS_MAX_I32 - - Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32() - { - } // ~Inst_DS__DS_MAX_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_U32 class methods --- - - Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_u32") - { - } // Inst_DS__DS_MIN_U32 - - Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32() - { - } // ~Inst_DS__DS_MIN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void
- Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_DS__DS_MAX_U32 class methods ---
-
- Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_max_u32")
- {
- } // Inst_DS__DS_MAX_U32
-
- Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32()
- {
- } // ~Inst_DS__DS_MAX_U32
-
- // --- description from .arch file ---
- // 32b:
- // tmp = MEM[ADDR];
- // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
- // RETURN_DATA = tmp.
- void
- Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_DS__DS_AND_B32 class methods ---
-
- Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_and_b32")
- {
- } // Inst_DS__DS_AND_B32
-
- Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32()
- {
- } // ~Inst_DS__DS_AND_B32
-
- // --- description from .arch file ---
- // 32b:
- // tmp = MEM[ADDR];
- // MEM[ADDR] &= DATA;
- // RETURN_DATA = tmp.
- void
- Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_DS__DS_OR_B32 class methods ---
-
- Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_or_b32")
- {
- setFlag(MemoryRef);
- setFlag(GroupSegment);
- setFlag(AtomicOr);
- setFlag(AtomicNoReturn);
- } // Inst_DS__DS_OR_B32
-
- Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32()
- {
- } // ~Inst_DS__DS_OR_B32
-
- // --- description from .arch file ---
- // 32b:
- // MEM[ADDR] |= DATA;
- void
- Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
-
- addr.read();
- data.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
- = data[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0;
- Addr offset1 = instData.OFFSET1;
- Addr offset = (offset1 << 8) | offset0;
-
- initAtomicAccess(gpuDynInst, offset);
- } // initiateAcc
-
- void
- Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- } // completeAcc
-
- // --- Inst_DS__DS_XOR_B32 class methods ---
-
- Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_xor_b32")
- {
- } // Inst_DS__DS_XOR_B32
-
- Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32()
- {
- } // ~Inst_DS__DS_XOR_B32
-
- // --- description from .arch file ---
- // 32b:
- // tmp = MEM[ADDR];
- // MEM[ADDR] ^= DATA;
- // RETURN_DATA = tmp.
- void
- Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_DS__DS_MSKOR_B32 class methods ---
-
- Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_mskor_b32")
- {
- } // Inst_DS__DS_MSKOR_B32
-
- Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32()
- {
- } // ~Inst_DS__DS_MSKOR_B32
-
- // --- description from .arch file ---
- // 32b:
- // tmp = MEM[ADDR];
- // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
- // RETURN_DATA = tmp.
- // Masked dword OR, D0 contains the mask and D1 contains the new value.
- void
- Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- panicUnimplemented();
- } // execute
- // --- Inst_DS__DS_WRITE_B32 class methods ---
-
- Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_write_b32")
- {
- setFlag(MemoryRef);
- setFlag(Store);
- } // Inst_DS__DS_WRITE_B32
-
- Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32()
- {
- } // ~Inst_DS__DS_WRITE_B32
-
- // --- description from .arch file ---
- // 32b:
- // MEM[ADDR] = DATA.
- // Write dword.
- void
- Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
-
- addr.read();
- data.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane]
- = data[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0;
- Addr offset1 = instData.OFFSET1;
- Addr offset = (offset1 << 8) | offset0;
-
- initMemWrite(gpuDynInst, offset);
- } // initiateAcc
-
- void
- Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- } // completeAcc
- // --- Inst_DS__DS_WRITE2_B32 class methods ---
-
- Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_write2_b32")
- {
- setFlag(MemoryRef);
- setFlag(Store);
- } // Inst_DS__DS_WRITE2_B32
-
- Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32()
- {
- } // ~Inst_DS__DS_WRITE2_B32
-
- // --- description from .arch file ---
- // 32b:
- // MEM[ADDR_BASE + OFFSET0 * 4] = DATA;
- // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2.
- // Write 2 dwords.
- void
- Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
- ConstVecOperandU32 data1(gpuDynInst, extData.DATA1);
-
- addr.read();
- data0.read();
- data1.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2]
- = data0[lane];
- (reinterpret_cast<VecElemU32*>(
- gpuDynInst->d_data))[lane * 2 + 1] = data1[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0 * 4;
- Addr offset1 = instData.OFFSET1 * 4;
-
- initDualMemWrite(gpuDynInst, offset0, offset1);
- }
-
- void
- Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- }
- // --- Inst_DS__DS_WRITE2ST64_B32 class methods ---
-
- Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_write2st64_b32")
- {
- setFlag(MemoryRef);
- setFlag(Store);
- } // Inst_DS__DS_WRITE2ST64_B32
-
- Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32()
- {
- } // ~Inst_DS__DS_WRITE2ST64_B32
-
- // --- description from .arch file ---
- // 32b:
- // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA;
- // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2;
- // Write 2 dwords.
- void
- Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
- ConstVecOperandU32 data1(gpuDynInst, extData.DATA1);
-
- addr.read();
- data0.read();
- data1.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2]
- = data0[lane];
- (reinterpret_cast<VecElemU32*>(
- gpuDynInst->d_data))[lane * 2 + 1] = data1[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0 * 4 * 64;
- Addr offset1 = instData.OFFSET1 * 4 * 64;
-
- initDualMemWrite(gpuDynInst, offset0, offset1);
- }
-
- void
- Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- }
- // --- Inst_DS__DS_CMPST_B32 class methods ---
-
- Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_cmpst_b32")
- {
- } // Inst_DS__DS_CMPST_B32
-
- Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32()
- {
- } // ~Inst_DS__DS_CMPST_B32
-
- // --- description from .arch file ---
- // 32b:
- // tmp = MEM[ADDR];
- // src = DATA2;
- // cmp = DATA;
- // MEM[ADDR] = (tmp == cmp) ? src : tmp;
- // RETURN_DATA[0] = tmp.
- // Compare and store.
- // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP opcode. - void - Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_F32 class methods --- - - Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_f32") - { - setFlag(F32); - } // Inst_DS__DS_CMPST_F32 - - Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32() - { - } // ~Inst_DS__DS_CMPST_F32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP opcode. - void - Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_F32 class methods --- - - Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_f32") - { - setFlag(F32); - } // Inst_DS__DS_MIN_F32 - - Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32() - { - } // ~Inst_DS__DS_MIN_F32 - - // --- description from .arch file --- - // 32b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN. - void - Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_F32 class methods --- - - Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_f32") - { - setFlag(F32); - } // Inst_DS__DS_MAX_F32 - - Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32() - { - } // ~Inst_DS__DS_MAX_F32 - - // --- description from .arch file --- - // 32b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX. - void - Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_NOP class methods --- - - Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_nop") - { - setFlag(Nop); - } // Inst_DS__DS_NOP - - Inst_DS__DS_NOP::~Inst_DS__DS_NOP() - { - } // ~Inst_DS__DS_NOP - - // --- description from .arch file --- - // Do nothing. - void - Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst) - { - gpuDynInst->wavefront()->decLGKMInstsIssued(); - } // execute - // --- Inst_DS__DS_ADD_F32 class methods --- - - Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_f32") - { - setFlag(F32); - setFlag(MemoryRef); - setFlag(GroupSegment); - setFlag(AtomicAdd); - setFlag(AtomicNoReturn); - } // Inst_DS__DS_ADD_F32 - - Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32() - { - } // ~Inst_DS__DS_ADD_F32 - - // --- description from .arch file --- - // 32b: - // MEM[ADDR] += DATA; - // Floating point add that handles NaN/INF/denormal values. 
- void
- Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandF32 data(gpuDynInst, extData.DATA0);
-
- addr.read();
- data.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemF32*>(gpuDynInst->a_data))[lane]
- = data[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0;
- Addr offset1 = instData.OFFSET1;
- Addr offset = (offset1 << 8) | offset0;
-
- initAtomicAccess(gpuDynInst, offset);
- } // initiateAcc
-
- void
- Inst_DS__DS_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- } // completeAcc
- // --- Inst_DS__DS_WRITE_B8 class methods ---
-
- Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_write_b8")
- {
- setFlag(MemoryRef);
- setFlag(Store);
- } // Inst_DS__DS_WRITE_B8
-
- Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8()
- {
- } // ~Inst_DS__DS_WRITE_B8
-
- // --- description from .arch file ---
- // MEM[ADDR] = DATA[7:0].
- // Byte write.
- void
- Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU8 data(gpuDynInst, extData.DATA0);
-
- addr.read();
- data.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane]
- = data[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0;
- Addr offset1 = instData.OFFSET1;
- Addr offset = (offset1 << 8) | offset0;
-
- initMemWrite(gpuDynInst, offset);
- } // initiateAcc
-
- void
- Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- } // completeAcc
- // --- Inst_DS__DS_WRITE_B8_D16_HI class methods ---
-
- Inst_DS__DS_WRITE_B8_D16_HI::Inst_DS__DS_WRITE_B8_D16_HI(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_write_b8_d16_hi")
- {
- setFlag(MemoryRef);
- setFlag(Store);
- } // Inst_DS__DS_WRITE_B8_D16_HI
-
- Inst_DS__DS_WRITE_B8_D16_HI::~Inst_DS__DS_WRITE_B8_D16_HI()
- {
- } // ~Inst_DS__DS_WRITE_B8_D16_HI
-
- // --- description from .arch file ---
- // MEM[ADDR] = DATA[23:16].
- // Byte write in to high word.
- void
- Inst_DS__DS_WRITE_B8_D16_HI::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU8 data(gpuDynInst, extData.DATA0);
-
- addr.read();
- data.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane]
- = bits(data[lane], 23, 16);
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_WRITE_B8_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0;
- Addr offset1 = instData.OFFSET1;
- Addr offset = (offset1 << 8) | offset0;
-
- initMemWrite(gpuDynInst, offset);
- } // initiateAcc
-
- void
- Inst_DS__DS_WRITE_B8_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- } // completeAcc
- // --- Inst_DS__DS_WRITE_B16 class methods ---
-
- Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_write_b16")
- {
- setFlag(MemoryRef);
- setFlag(Store);
- } // Inst_DS__DS_WRITE_B16
-
- Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16()
- {
- } // ~Inst_DS__DS_WRITE_B16
-
- // --- description from .arch file ---
- // MEM[ADDR] = DATA[15:0]
- // Short write.
- void
- Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst)
- {
- Wavefront *wf = gpuDynInst->wavefront();
-
- if (gpuDynInst->exec_mask.none()) {
- wf->decLGKMInstsIssued();
- return;
- }
-
- gpuDynInst->execUnitId = wf->execUnitId;
- gpuDynInst->latency.init(gpuDynInst->computeUnit());
- gpuDynInst->latency.set(
- gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
- ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
- ConstVecOperandU16 data(gpuDynInst, extData.DATA0);
-
- addr.read();
- data.read();
-
- calcAddr(gpuDynInst, addr);
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane]
- = data[lane];
- }
- }
-
- gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
- } // execute
-
- void
- Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst)
- {
- Addr offset0 = instData.OFFSET0;
- Addr offset1 = instData.OFFSET1;
- Addr offset = (offset1 << 8) | offset0;
-
- initMemWrite(gpuDynInst, offset);
- } // initiateAcc
-
- void
- Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst)
- {
- } // completeAcc
- // --- Inst_DS__DS_ADD_RTN_U32 class methods ---
-
- Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt)
- : Inst_DS(iFmt, "ds_add_rtn_u32")
- {
- } // Inst_DS__DS_ADD_RTN_U32
-
- Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32()
- {
- } // ~Inst_DS__DS_ADD_RTN_U32
-
- // --- description from .arch file ---
- // 32b:
- // tmp = MEM[ADDR];
- // MEM[ADDR] += DATA;
- // RETURN_DATA = tmp.
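// Illustrative sketch only (not part of this change): the returning variant
// could mirror ds_add_u32 above but with AtomicReturn instead of
// AtomicNoReturn, plus a completeAcc() that copies the returned pre-op value
// into the destination VGPR (the VDST field name is assumed here):
//
//     void
//     Inst_DS__DS_ADD_RTN_U32::completeAcc(GPUDynInstPtr gpuDynInst)
//     {
//         VecOperandU32 vdst(gpuDynInst, extData.VDST);
//
//         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
//             if (gpuDynInst->exec_mask[lane]) {
//                 vdst[lane] = (reinterpret_cast<VecElemU32*>(
//                     gpuDynInst->d_data))[lane];
//             }
//         }
//
//         vdst.write();
//     }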
- void - Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_RTN_U32 class methods --- - - Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_rtn_u32") - { - } // Inst_DS__DS_SUB_RTN_U32 - - Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32() - { - } // ~Inst_DS__DS_SUB_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_RTN_U32 class methods --- - - Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_rtn_u32") - { - } // Inst_DS__DS_RSUB_RTN_U32 - - Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32() - { - } // ~Inst_DS__DS_RSUB_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_RTN_U32 class methods --- - - Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_rtn_u32") - { - } // Inst_DS__DS_INC_RTN_U32 - - Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32() - { - } // ~Inst_DS__DS_INC_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_RTN_U32 class methods --- - - Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_rtn_u32") - { - } // Inst_DS__DS_DEC_RTN_U32 - - Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32() - { - } // ~Inst_DS__DS_DEC_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_I32 class methods --- - - Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_i32") - { - } // Inst_DS__DS_MIN_RTN_I32 - - Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32() - { - } // ~Inst_DS__DS_MIN_RTN_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_I32 class methods --- - - Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_i32") - { - } // Inst_DS__DS_MAX_RTN_I32 - - Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32() - { - } // ~Inst_DS__DS_MAX_RTN_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_U32 class methods --- - - Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_u32") - { - } // Inst_DS__DS_MIN_RTN_U32 - - Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32() - { - } // ~Inst_DS__DS_MIN_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_U32 class methods --- - - Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_u32") - { - } // Inst_DS__DS_MAX_RTN_U32 - - Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32() - { - } // ~Inst_DS__DS_MAX_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_RTN_B32 class methods --- - - Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_rtn_b32") - { - } // Inst_DS__DS_AND_RTN_B32 - - Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32() - { - } // ~Inst_DS__DS_AND_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_RTN_B32 class methods --- - - Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_rtn_b32") - { - } // Inst_DS__DS_OR_RTN_B32 - - Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32() - { - } // ~Inst_DS__DS_OR_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_RTN_B32 class methods --- - - Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_rtn_b32") - { - } // Inst_DS__DS_XOR_RTN_B32 - - Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32() - { - } // ~Inst_DS__DS_XOR_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_RTN_B32 class methods --- - - Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_rtn_b32") - { - } // Inst_DS__DS_MSKOR_RTN_B32 - - Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32() - { - } // ~Inst_DS__DS_MSKOR_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. 
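The mskor pseudocode above is easy to misread: DATA is the clear mask and DATA2 supplies the bits to OR in. A worked example with illustrative values:

    // (MEM & ~DATA) | DATA2, with MEM = 0xFF00, DATA = 0x00F0, DATA2 = 0x00A0:
    // (0xFF00 & 0xFFFFFF0F) | 0x00A0 = 0xFF00 | 0x00A0 = 0xFFA0; 0xFF00 is returned.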
- void - Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG_RTN_B32 class methods --- - - Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg_rtn_b32") - { - } // Inst_DS__DS_WRXCHG_RTN_B32 - - Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32() - { - } // ~Inst_DS__DS_WRXCHG_RTN_B32 - - // --- description from .arch file --- - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - // Write-exchange operation. - void - Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2_RTN_B32 class methods --- - - Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32") - { - } // Inst_DS__DS_WRXCHG2_RTN_B32 - - Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32() - { - } // ~Inst_DS__DS_WRXCHG2_RTN_B32 - - // --- description from .arch file --- - // Write-exchange 2 separate dwords. - void - Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2ST64_RTN_B32 class methods --- - - Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32") - { - } // Inst_DS__DS_WRXCHG2ST64_RTN_B32 - - Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32() - { - } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32 - - // --- description from .arch file --- - // Write-exchange 2 separate dwords with a stride of 64 dwords. - void - Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_B32 class methods --- - - Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_b32") - { - } // Inst_DS__DS_CMPST_RTN_B32 - - Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32() - { - } // ~Inst_DS__DS_CMPST_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP opcode. - void - Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_F32 class methods --- - - Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_CMPST_RTN_F32 - - Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32() - { - } // ~Inst_DS__DS_CMPST_RTN_F32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP opcode. 
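Restated in C-like form (a sketch of the .arch pseudocode, not gem5 code; data0/data1 are illustrative names for the DATA and DATA2 operands), the compare value comes from DATA and the store value from DATA2, which is the reverse of the buffer atomic cmpswap ordering that the comment above warns about:

    uint32_t tmp = mem[addr];
    uint32_t cmp = data0;                  // DATA
    uint32_t src = data1;                  // DATA2
    mem[addr] = (tmp == cmp) ? src : tmp;
    return_data = tmp;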
- void - Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_F32 class methods --- - - Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_MIN_RTN_F32 - - Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32() - { - } // ~Inst_DS__DS_MIN_RTN_F32 - - // --- description from .arch file --- - // 32b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN. - void - Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_F32 class methods --- - - Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_MAX_RTN_F32 - - Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32() - { - } // ~Inst_DS__DS_MAX_RTN_F32 - - // --- description from .arch file --- - // 32b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX. - void - Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRAP_RTN_B32 class methods --- - - Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrap_rtn_b32") - { - } // Inst_DS__DS_WRAP_RTN_B32 - - Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32() - { - } // ~Inst_DS__DS_WRAP_RTN_B32 - - // --- description from .arch file --- - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2; - // RETURN_DATA = tmp. - void - Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_RTN_F32 class methods --- - - Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_ADD_RTN_F32 - - Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32() - { - } // ~Inst_DS__DS_ADD_RTN_F32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - // Floating point add that handles NaN/INF/denormal values. - void - Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_B32 class methods --- - - Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B32 - - Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32() - { - } // ~Inst_DS__DS_READ_B32 - - // --- description from .arch file --- - // RETURN_DATA = MEM[ADDR]. - // Dword read. 
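The dword read below is the template the remaining implemented DS loads follow: execute() reads the per-lane byte addresses, resolves them through calcAddr() and hands the request to the local memory pipeline; initiateAcc() starts the LDS access at the 16-bit immediate offset (OFFSET1 << 8) | OFFSET0; completeAcc() copies each active lane's result out of d_data into the VDST operand.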
- void - Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ2_B32 class methods --- - - Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B32 - - Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32() - { - } // ~Inst_DS__DS_READ2_B32 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4]. - // Read 2 dwords. - void - Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4; - Addr offset1 = instData.OFFSET1 * 4; - - initDualMemRead(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_DS__DS_READ2ST64_B32 class methods --- - - Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B32 - - Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32() - { - } // ~Inst_DS__DS_READ2ST64_B32 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64]. - // Read 2 dwords. 
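In the ST64 form below each offset field is scaled by the 64-element stride as well as the 4-byte element size, so with illustrative values OFFSET0 = 0 and OFFSET1 = 1 the two dwords are read from ADDR_BASE + 0 and ADDR_BASE + 256 bytes:

    Addr offset0 = instData.OFFSET0 * 4 * 64;   // 0 * 256 = 0
    Addr offset1 = instData.OFFSET1 * 4 * 64;   // 1 * 256 = 256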
- void - Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 4 * 64); - Addr offset1 = (instData.OFFSET1 * 4 * 64); - - initDualMemRead(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - // --- Inst_DS__DS_READ_I8 class methods --- - - Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I8 - - Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() - { - } // ~Inst_DS__DS_READ_I8 - - // --- description from .arch file --- - // RETURN_DATA = signext(MEM[ADDR][7:0]). - // Signed byte read. - void - Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_I8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_I8::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)sext<8>((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ_U8 class methods --- - - Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U8 - - Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() - { - } // ~Inst_DS__DS_READ_U8 - - // --- description from .arch file --- - // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. - // Unsigned byte read. 
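Unlike ds_read_i8 above, which sign-extends the loaded byte with sext<8>, the unsigned byte read below zero-extends it: completeAcc() simply casts the 8-bit result to VecElemU32, so a loaded value of 0x80 becomes 0x00000080 here but 0xFFFFFF80 in the signed form.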
- void - Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ_I16 class methods --- - - Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I16 - - Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() - { - } // ~Inst_DS__DS_READ_I16 - - // --- description from .arch file --- - // RETURN_DATA = signext(MEM[ADDR][15:0]). - // Signed short read. - void - Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_U16 class methods --- - - Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U16 - - Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() - { - } // ~Inst_DS__DS_READ_U16 - - // --- description from .arch file --- - // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. - // Unsigned short read. 
- void - Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - void - Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_SWIZZLE_B32 class methods --- - - Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_swizzle_b32") - { - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_SWIZZLE_B32 - - Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() - { - } // ~Inst_DS__DS_SWIZZLE_B32 - - // --- description from .arch file --- - // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). - // Dword swizzle, no data is written to LDS memory; See ds_opcodes.docx for - // --- details. - void - Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - /** - * The "DS pattern" is comprised of both offset fields. That is, the - * swizzle pattern between lanes. Bit 15 of the DS pattern dictates - * which swizzle mode to use. There are two different swizzle - * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use - * QDMode else use Bit-masks mode. The remaining bits dictate how to - * swizzle the lanes. - * - * QDMode: Chunks the lanes into 4s and swizzles among them. - * Bits 7:6 dictate where lane 3 (of the current chunk) - * gets its date, 5:4 lane 2, etc. - * - * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks. - * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 - * is the and_mask. Each lane is swizzled by performing - * the appropriate operation using these masks. - */ - VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); - - data.read(); - - if (bits(ds_pattern, 15)) { - // QDMode - for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { - /** - * This operation allows data sharing between groups - * of four consecutive threads. Note the increment by - * 4 in the for loop. 
- */ - if (gpuDynInst->exec_mask[lane]) { - int index0 = lane + bits(ds_pattern, 1, 0); - panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index0); - vdst[lane] - = gpuDynInst->exec_mask[index0] ? data[index0]: 0; - } - if (gpuDynInst->exec_mask[lane + 1]) { - int index1 = lane + bits(ds_pattern, 3, 2); - panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index1); - vdst[lane + 1] - = gpuDynInst->exec_mask[index1] ? data[index1]: 0; - } - if (gpuDynInst->exec_mask[lane + 2]) { - int index2 = lane + bits(ds_pattern, 5, 4); - panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index2); - vdst[lane + 2] - = gpuDynInst->exec_mask[index2] ? data[index2]: 0; - } - if (gpuDynInst->exec_mask[lane + 3]) { - int index3 = lane + bits(ds_pattern, 7, 6); - panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index3); - vdst[lane + 3] - = gpuDynInst->exec_mask[index3] ? data[index3]: 0; - } - } - } else { - // Bit Mode - int and_mask = bits(ds_pattern, 4, 0); - int or_mask = bits(ds_pattern, 9, 5); - int xor_mask = bits(ds_pattern, 14, 10); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - int index = (((lane & and_mask) | or_mask) ^ xor_mask); - // Adjust for the next 32 lanes. - if (lane > 31) { - index += 32; - } - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is " - "out of bounds.\n", gpuDynInst->disassemble(), - index); - vdst[lane] - = gpuDynInst->exec_mask[index] ? data[index] : 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. - * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - // --- Inst_DS__DS_PERMUTE_B32 class methods --- - - Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_permute_b32") - { - setFlag(MemoryRef); - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_PERMUTE_B32 - - Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() - { - } // ~Inst_DS__DS_PERMUTE_B32 - - // --- description from .arch file --- - // Forward permute. - void - Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - addr.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - /** - * One of the offset fields can be used for the index. 
- * It is assumed OFFSET0 would be used, as OFFSET1 is - * typically only used for DS ops that operate on two - * disparate pieces of data. - */ - assert(!instData.OFFSET1); - /** - * The address provided is a byte address, but VGPRs are - * 4 bytes, so we must divide by 4 to get the actual VGPR - * index. Additionally, the index is calculated modulo the - * WF size, 64 in this case, so we simply extract bits 7-2. - */ - int index = bits(addr[lane] + instData.OFFSET0, 7, 2); - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " - "of bounds.\n", gpuDynInst->disassemble(), index); - /** - * If the shuffled index corresponds to a lane that is - * inactive then this instruction writes a 0 to the active - * lane in VDST. - */ - if (wf->execMask(index)) { - vdst[index] = data[lane]; - } else { - vdst[index] = 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. - * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - // --- Inst_DS__DS_BPERMUTE_B32 class methods --- - - Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_bpermute_b32") - { - setFlag(MemoryRef); - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_BPERMUTE_B32 - - Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() - { - } // ~Inst_DS__DS_BPERMUTE_B32 - - // --- description from .arch file --- - // Backward permute. - void - Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - addr.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - /** - * One of the offset fields can be used for the index. - * It is assumed OFFSET0 would be used, as OFFSET1 is - * typically only used for DS ops that operate on two - * disparate pieces of data. - */ - assert(!instData.OFFSET1); - /** - * The address provided is a byte address, but VGPRs are - * 4 bytes, so we must divide by 4 to get the actual VGPR - * index. Additionally, the index is calculated modulo the - * WF size, 64 in this case, so we simply extract bits 7-2. - */ - int index = bits(addr[lane] + instData.OFFSET0, 7, 2); - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " - "of bounds.\n", gpuDynInst->disassemble(), index); - /** - * If the shuffled index corresponds to a lane that is - * inactive then this instruction writes a 0 to the active - * lane in VDST. 
- */ - if (wf->execMask(index)) { - vdst[lane] = data[index]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. - * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - - // --- Inst_DS__DS_ADD_U64 class methods --- - - Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_u64") - { - setFlag(MemoryRef); - setFlag(GroupSegment); - setFlag(AtomicAdd); - setFlag(AtomicNoReturn); - } // Inst_DS__DS_ADD_U64 - - Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() - { - } // ~Inst_DS__DS_ADD_U64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR] += DATA[0:1]; - void - Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_ADD_U64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initAtomicAccess(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_ADD_U64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_SUB_U64 class methods --- - - Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_u64") - { - } // Inst_DS__DS_SUB_U64 - - Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64() - { - } // ~Inst_DS__DS_SUB_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_U64 class methods --- - - Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_u64") - { - } // Inst_DS__DS_RSUB_U64 - - Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64() - { - } // ~Inst_DS__DS_RSUB_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. 
- void - Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_U64 class methods --- - - Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_u64") - { - } // Inst_DS__DS_INC_U64 - - Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64() - { - } // ~Inst_DS__DS_INC_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_U64 class methods --- - - Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_u64") - { - } // Inst_DS__DS_DEC_U64 - - Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64() - { - } // ~Inst_DS__DS_DEC_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_I64 class methods --- - - Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_i64") - { - } // Inst_DS__DS_MIN_I64 - - Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64() - { - } // ~Inst_DS__DS_MIN_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_I64 class methods --- - - Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_i64") - { - } // Inst_DS__DS_MAX_I64 - - Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64() - { - } // ~Inst_DS__DS_MAX_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_U64 class methods --- - - Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_u64") - { - } // Inst_DS__DS_MIN_U64 - - Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64() - { - } // ~Inst_DS__DS_MIN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_U64 class methods --- - - Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_u64") - { - } // Inst_DS__DS_MAX_U64 - - Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64() - { - } // ~Inst_DS__DS_MAX_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_B64 class methods --- - - Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_b64") - { - } // Inst_DS__DS_AND_B64 - - Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64() - { - } // ~Inst_DS__DS_AND_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_B64 class methods --- - - Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_b64") - { - } // Inst_DS__DS_OR_B64 - - Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64() - { - } // ~Inst_DS__DS_OR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_B64 class methods --- - - Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_b64") - { - } // Inst_DS__DS_XOR_B64 - - Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64() - { - } // ~Inst_DS__DS_XOR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_B64 class methods --- - - Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_b64") - { - } // Inst_DS__DS_MSKOR_B64 - - Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64() - { - } // ~Inst_DS__DS_MSKOR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. - void - Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_B64 class methods --- - - Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B64 - - Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64() - { - } // ~Inst_DS__DS_WRITE_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR] = DATA. - // Write qword. 
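The qword store below follows the same pattern as the 32-bit DS writes earlier in this file, except that the data operand is read as ConstVecOperandU64 and each active lane stages a 64-bit element in d_data; the immediate byte offset is still (OFFSET1 << 8) | OFFSET0.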
- void - Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE2_B64 class methods --- - - Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2_B64 - - Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() - { - } // ~Inst_DS__DS_WRITE2_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. - // Write 2 qwords. - void - Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_WRITE2ST64_B64 class methods --- - - Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2st64_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2ST64_B64 - - Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() - { - } // ~Inst_DS__DS_WRITE2ST64_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; - // Write 2 qwords. 
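As with the ST64 reads, the qword write2 variant below scales each offset by the 8-byte element size and the 64-element stride, so OFFSET0 = 0 and OFFSET1 = 1 (illustrative values) place the two stores at ADDR_BASE + 0 and ADDR_BASE + 512 bytes:

    Addr offset0 = instData.OFFSET0 * 8 * 64;   // 0 * 512 = 0
    Addr offset1 = instData.OFFSET1 * 8 * 64;   // 1 * 512 = 512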
- void - Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8 * 64; - Addr offset1 = instData.OFFSET1 * 8 * 64; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_CMPST_B64 class methods --- - - Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_b64") - { - } // Inst_DS__DS_CMPST_B64 - - Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() - { - } // ~Inst_DS__DS_CMPST_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. - void - Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_F64 class methods --- - - Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_f64") - { - setFlag(F64); - } // Inst_DS__DS_CMPST_F64 - - Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64() - { - } // ~Inst_DS__DS_CMPST_F64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. - void - Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_F64 class methods --- - - Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_F64 - - Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64() - { - } // ~Inst_DS__DS_MIN_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN_X2. 
- void - Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_F64 class methods --- - - Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_F64 - - Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64() - { - } // ~Inst_DS__DS_MAX_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX_X2. - void - Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_RTN_U64 class methods --- - - Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_u64") - { - } // Inst_DS__DS_ADD_RTN_U64 - - Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64() - { - } // ~Inst_DS__DS_ADD_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_RTN_U64 class methods --- - - Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_rtn_u64") - { - } // Inst_DS__DS_SUB_RTN_U64 - - Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64() - { - } // ~Inst_DS__DS_SUB_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_RTN_U64 class methods --- - - Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_rtn_u64") - { - } // Inst_DS__DS_RSUB_RTN_U64 - - Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64() - { - } // ~Inst_DS__DS_RSUB_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_RTN_U64 class methods --- - - Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_rtn_u64") - { - } // Inst_DS__DS_INC_RTN_U64 - - Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64() - { - } // ~Inst_DS__DS_INC_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_RTN_U64 class methods --- - - Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_rtn_u64") - { - } // Inst_DS__DS_DEC_RTN_U64 - - Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64() - { - } // ~Inst_DS__DS_DEC_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_I64 class methods --- - - Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_i64") - { - } // Inst_DS__DS_MIN_RTN_I64 - - Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64() - { - } // ~Inst_DS__DS_MIN_RTN_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_I64 class methods --- - - Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_i64") - { - } // Inst_DS__DS_MAX_RTN_I64 - - Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64() - { - } // ~Inst_DS__DS_MAX_RTN_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_U64 class methods --- - - Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_u64") - { - } // Inst_DS__DS_MIN_RTN_U64 - - Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64() - { - } // ~Inst_DS__DS_MIN_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_U64 class methods --- - - Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_u64") - { - } // Inst_DS__DS_MAX_RTN_U64 - - Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64() - { - } // ~Inst_DS__DS_MAX_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_RTN_B64 class methods --- - - Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_rtn_b64") - { - } // Inst_DS__DS_AND_RTN_B64 - - Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64() - { - } // ~Inst_DS__DS_AND_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_RTN_B64 class methods --- - - Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_rtn_b64") - { - } // Inst_DS__DS_OR_RTN_B64 - - Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64() - { - } // ~Inst_DS__DS_OR_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_RTN_B64 class methods --- - - Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_rtn_b64") - { - } // Inst_DS__DS_XOR_RTN_B64 - - Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64() - { - } // ~Inst_DS__DS_XOR_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_RTN_B64 class methods --- - - Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_rtn_b64") - { - } // Inst_DS__DS_MSKOR_RTN_B64 - - Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64() - { - } // ~Inst_DS__DS_MSKOR_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. - void - Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG_RTN_B64 class methods --- - - Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg_rtn_b64") - { - } // Inst_DS__DS_WRXCHG_RTN_B64 - - Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG_RTN_B64 - - // --- description from .arch file --- - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - // Write-exchange operation. - void - Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2_RTN_B64 class methods --- - - Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64") - { - } // Inst_DS__DS_WRXCHG2_RTN_B64 - - Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG2_RTN_B64 - - // --- description from .arch file --- - // Write-exchange 2 separate qwords. - void - Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2ST64_RTN_B64 class methods --- - - Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64") - { - } // Inst_DS__DS_WRXCHG2ST64_RTN_B64 - - Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64 - - // --- description from .arch file --- - // Write-exchange 2 qwords with a stride of 64 qwords. - void - Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_B64 class methods --- - - Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_b64") - { - } // Inst_DS__DS_CMPST_RTN_B64 - - Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64() - { - } // ~Inst_DS__DS_CMPST_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. 
- void - Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_F64 class methods --- - - Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_CMPST_RTN_F64 - - Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64() - { - } // ~Inst_DS__DS_CMPST_RTN_F64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. - void - Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_F64 class methods --- - - Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_RTN_F64 - - Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64() - { - } // ~Inst_DS__DS_MIN_RTN_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN_X2. - void - Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_F64 class methods --- - - Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_RTN_F64 - - Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64() - { - } // ~Inst_DS__DS_MAX_RTN_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX_X2. - void - Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_B64 class methods --- - - Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B64 - - Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() - { - } // ~Inst_DS__DS_READ_B64 - - // --- description from .arch file --- - // RETURN_DATA = MEM[ADDR]. - // Read 1 qword. 
- void - Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ2_B64 class methods --- - - Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B64 - - Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() - { - } // ~Inst_DS__DS_READ2_B64 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. - // Read 2 qwords. - void - Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemRead(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_DS__DS_READ2ST64_B64 class methods --- - - Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B64 - - Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() - { - } // ~Inst_DS__DS_READ2ST64_B64 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. - // Read 2 qwords. 
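For the dual-read forms (ds_read2_b64 / ds_read2st64_b64), OFFSET0 and OFFSET1 are element indices rather than byte offsets, and each 64-bit result occupies a VGPR pair, which is why the second result starts at VDST + 2. A small sketch of the offset scaling, matching the initiateAcc() implementations above (ds_read2_b64) and below (ds_read2st64_b64); the helper name is illustrative:

    // Byte offsets for the two qword reads; the st64 form adds a 64-element stride.
    std::pair<Addr, Addr>
    read2ByteOffsets(Addr offset0, Addr offset1, bool st64)
    {
        Addr stride = st64 ? 8 * 64 : 8;        // qword size, optionally x64
        return {offset0 * stride, offset1 * stride};
    }
    // qword 0 -> VGPR pair {VDST, VDST+1}, qword 1 -> {VDST+2, VDST+3}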
- void - Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 8 * 64); - Addr offset1 = (instData.OFFSET1 * 8 * 64); - - initDualMemRead(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - // --- Inst_DS__DS_CONDXCHG32_RTN_B64 class methods --- - - Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") - { - } // Inst_DS__DS_CONDXCHG32_RTN_B64 - - Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() - { - } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 - - // --- description from .arch file --- - // Conditional write exchange. - void - Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_SRC2_U32 class methods --- - - Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_src2_u32") - { - } // Inst_DS__DS_ADD_SRC2_U32 - - Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32() - { - } // ~Inst_DS__DS_ADD_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] + MEM[B]. - void - Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_SRC2_U32 class methods --- - - Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_src2_u32") - { - } // Inst_DS__DS_SUB_SRC2_U32 - - Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32() - { - } // ~Inst_DS__DS_SUB_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] - MEM[B]. - void - Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_SRC2_U32 class methods --- - - Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_src2_u32") - { - } // Inst_DS__DS_RSUB_SRC2_U32 - - Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32() - { - } // ~Inst_DS__DS_RSUB_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B] - MEM[A]. 
- void - Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_SRC2_U32 class methods --- - - Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_src2_u32") - { - } // Inst_DS__DS_INC_SRC2_U32 - - Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32() - { - } // ~Inst_DS__DS_INC_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). - void - Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_SRC2_U32 class methods --- - - Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_src2_u32") - { - } // Inst_DS__DS_DEC_SRC2_U32 - - Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32() - { - } // ~Inst_DS__DS_DEC_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). - // Uint decrement. - void - Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_I32 class methods --- - - Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_i32") - { - } // Inst_DS__DS_MIN_SRC2_I32 - - Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32() - { - } // ~Inst_DS__DS_MIN_SRC2_I32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). - void - Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_I32 class methods --- - - Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_i32") - { - } // Inst_DS__DS_MAX_SRC2_I32 - - Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32() - { - } // ~Inst_DS__DS_MAX_SRC2_I32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). - void - Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_U32 class methods --- - - Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_u32") - { - } // Inst_DS__DS_MIN_SRC2_U32 - - Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32() - { - } // ~Inst_DS__DS_MIN_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). 
- void - Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_U32 class methods --- - - Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_u32") - { - } // Inst_DS__DS_MAX_SRC2_U32 - - Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32() - { - } // ~Inst_DS__DS_MAX_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). - void - Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_SRC2_B32 class methods --- - - Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_src2_b32") - { - } // Inst_DS__DS_AND_SRC2_B32 - - Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32() - { - } // ~Inst_DS__DS_AND_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] & MEM[B]. - void - Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_SRC2_B32 class methods --- - - Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_src2_b32") - { - } // Inst_DS__DS_OR_SRC2_B32 - - Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32() - { - } // ~Inst_DS__DS_OR_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] | MEM[B]. - void - Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_SRC2_B32 class methods --- - - Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_src2_b32") - { - } // Inst_DS__DS_XOR_SRC2_B32 - - Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32() - { - } // ~Inst_DS__DS_XOR_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] ^ MEM[B]. - void - Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_SRC2_B32 class methods --- - - Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_src2_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_SRC2_B32 - - Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32() - { - } // ~Inst_DS__DS_WRITE_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B]. - // Write dword. 
- void - Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_F32 class methods --- - - Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_f32") - { - setFlag(F32); - } // Inst_DS__DS_MIN_SRC2_F32 - - Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32() - { - } // ~Inst_DS__DS_MIN_SRC2_F32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_F32 class methods --- - - Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_f32") - { - setFlag(F32); - } // Inst_DS__DS_MAX_SRC2_F32 - - Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32() - { - } // ~Inst_DS__DS_MAX_SRC2_F32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_SRC2_F32 class methods --- - - Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_src2_f32") - { - setFlag(F32); - } // Inst_DS__DS_ADD_SRC2_F32 - - Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32() - { - } // ~Inst_DS__DS_ADD_SRC2_F32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B] + MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_RELEASE_ALL class methods --- - - Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_release_all") - { - } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL - - Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL() - { - } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL - - // --- description from .arch file --- - // GDS Only: The GWS resource (rid) indicated will process this opcode by - // updating the counter and labeling the specified resource as a semaphore. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // //Incr the state counter of the resource - // state.counter[rid] = state.wave_in_queue; - // state.type = SEMAPHORE; - // return rd_done; //release calling wave - // This action will release ALL queued waves; it Will have no effect if no - // --- waves are present. - void - Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_INIT class methods --- - - Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_init") - { - } // Inst_DS__DS_GWS_INIT - - Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT() - { - } // ~Inst_DS__DS_GWS_INIT - - // --- description from .arch file --- - // GDS Only: Initialize a barrier or semaphore resource. 
- // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // //Get the value to use in init - // index = find_first_valid(vector mask) - // value = DATA[thread: index] - // //Set the state of the resource - // state.counter[rid] = lsb(value); //limit #waves - // state.flag[rid] = 0; - // return rd_done; //release calling wave - void - Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_V class methods --- - - Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_v") - { - } // Inst_DS__DS_GWS_SEMA_V - - Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V() - { - } // ~Inst_DS__DS_GWS_SEMA_V - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // updating the counter and labeling the resource as a semaphore. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // //Incr the state counter of the resource - // state.counter[rid]++; - // state.type = SEMAPHORE; - // return rd_done; //release calling wave - // This action will release one waved if any are queued in this resource. - void - Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_BR class methods --- - - Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_br") - { - } // Inst_DS__DS_GWS_SEMA_BR - - Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR() - { - } // ~Inst_DS__DS_GWS_SEMA_BR - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // updating the counter by the bulk release delivered count and labeling - // the resource as a semaphore. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // index = find first valid (vector mask) - // count = DATA[thread: index]; - // //Add count to the resource state counter - // state.counter[rid] += count; - // state.type = SEMAPHORE; - // return rd_done; //release calling wave - // This action will release count number of waves, immediately if queued, - // or as they arrive from the noted resource. - void - Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_P class methods --- - - Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_p") - { - } // Inst_DS__DS_GWS_SEMA_P - - Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P() - { - } // ~Inst_DS__DS_GWS_SEMA_P - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // queueing it until counter enables a release and then decrementing the - // counter of the resource as a semaphore. 
- // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // state.type = SEMAPHORE; - // ENQUEUE until(state[rid].counter > 0) - // state[rid].counter--; - // return rd_done - void - Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_BARRIER class methods --- - - Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_barrier") - { - } // Inst_DS__DS_GWS_BARRIER - - Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER() - { - } // ~Inst_DS__DS_GWS_BARRIER - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // queueing it until barrier is satisfied. The number of waves needed is - // passed in as DATA of first valid thread. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + OFFSET0[5:0]; - // index = find first valid (vector mask); - // value = DATA[thread: index]; - // // Input Decision Machine - // state.type[rid] = BARRIER; - // if(state[rid].counter <= 0) { - // thread[rid].flag = state[rid].flag; - // ENQUEUE; - // state[rid].flag = !state.flag; - // state[rid].counter = value; - // return rd_done; - // } else { - // state[rid].counter--; - // thread.flag = state[rid].flag; - // ENQUEUE; - // } - // Since the waves deliver the count for the next barrier, this function - // can have a different size barrier for each occurrence. - // // Release Machine - // if(state.type == BARRIER) { - // if(state.flag != thread.flag) { - // return rd_done; - // } - // } - void - Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CONSUME class methods --- - - Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_consume") - { - } // Inst_DS__DS_CONSUME - - Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME() - { - } // ~Inst_DS__DS_CONSUME - - // --- description from .arch file --- - // LDS & GDS. Subtract (count_bits(exec_mask)) from the value stored in DS - // memory at (M0.base + instr_offset). Return the pre-operation value to - // VGPRs. - void - Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_APPEND class methods --- - - Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_append") - { - } // Inst_DS__DS_APPEND - - Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND() - { - } // ~Inst_DS__DS_APPEND - - // --- description from .arch file --- - // LDS & GDS. Add (count_bits(exec_mask)) to the value stored in DS memory - // at (M0.base + instr_offset). Return the pre-operation value to VGPRs. - void - Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ORDERED_COUNT class methods --- - - Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_ordered_count") - { - } // Inst_DS__DS_ORDERED_COUNT - - Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT() - { - } // ~Inst_DS__DS_ORDERED_COUNT - - // --- description from .arch file --- - // GDS-only. Add (count_bits(exec_mask)) to one of 4 dedicated - // ordered-count counters (aka 'packers'). Additional bits of instr.offset - // field are overloaded to hold packer-id, 'last'. 
- void - Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_SRC2_U64 class methods --- - - Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_src2_u64") - { - } // Inst_DS__DS_ADD_SRC2_U64 - - Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64() - { - } // ~Inst_DS__DS_ADD_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] + MEM[B]. - void - Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_SRC2_U64 class methods --- - - Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_src2_u64") - { - } // Inst_DS__DS_SUB_SRC2_U64 - - Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64() - { - } // ~Inst_DS__DS_SUB_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] - MEM[B]. - void - Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_SRC2_U64 class methods --- - - Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_src2_u64") - { - } // Inst_DS__DS_RSUB_SRC2_U64 - - Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64() - { - } // ~Inst_DS__DS_RSUB_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B] - MEM[A]. - void - Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_SRC2_U64 class methods --- - - Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_src2_u64") - { - } // Inst_DS__DS_INC_SRC2_U64 - - Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64() - { - } // ~Inst_DS__DS_INC_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). - void - Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_SRC2_U64 class methods --- - - Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_src2_u64") - { - } // Inst_DS__DS_DEC_SRC2_U64 - - Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64() - { - } // ~Inst_DS__DS_DEC_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). - // Uint decrement. 
- void - Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_I64 class methods --- - - Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_i64") - { - } // Inst_DS__DS_MIN_SRC2_I64 - - Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64() - { - } // ~Inst_DS__DS_MIN_SRC2_I64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). - void - Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_I64 class methods --- - - Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_i64") - { - } // Inst_DS__DS_MAX_SRC2_I64 - - Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64() - { - } // ~Inst_DS__DS_MAX_SRC2_I64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). - void - Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_U64 class methods --- - - Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_u64") - { - } // Inst_DS__DS_MIN_SRC2_U64 - - Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64() - { - } // ~Inst_DS__DS_MIN_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). - void - Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_U64 class methods --- - - Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_u64") - { - } // Inst_DS__DS_MAX_SRC2_U64 - - Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64() - { - } // ~Inst_DS__DS_MAX_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). - void - Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_SRC2_B64 class methods --- - - Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_src2_b64") - { - } // Inst_DS__DS_AND_SRC2_B64 - - Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() - { - } // ~Inst_DS__DS_AND_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] & MEM[B]. - void - Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_SRC2_B64 class methods --- - - Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_src2_b64") - { - } // Inst_DS__DS_OR_SRC2_B64 - - Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() - { - } // ~Inst_DS__DS_OR_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? 
{A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] | MEM[B]. - void - Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_SRC2_B64 class methods --- - - Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_src2_b64") - { - } // Inst_DS__DS_XOR_SRC2_B64 - - Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() - { - } // ~Inst_DS__DS_XOR_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] ^ MEM[B]. - void - Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_SRC2_B64 class methods --- - - Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_src2_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_SRC2_B64 - - Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() - { - } // ~Inst_DS__DS_WRITE_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B]. - // Write qword. - void - Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_F64 class methods --- - - Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_SRC2_F64 - - Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() - { - } // ~Inst_DS__DS_MIN_SRC2_F64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_F64 class methods --- - - Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_SRC2_F64 - - Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() - { - } // ~Inst_DS__DS_MAX_SRC2_F64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_B96 class methods --- - - Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b96") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B96 - - Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() - { - } // ~Inst_DS__DS_WRITE_B96 - - // --- description from .arch file --- - // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0]. - // Tri-dword write. 
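The tri-dword write below stages each lane's data with a stride of four dwords in d_data (slots lane*4 through lane*4 + 2, the fourth slot left unused), which appears to let ds_write_b96 and ds_write_b128 share the same per-lane staging layout ahead of initMemWrite<3>/<4>. Condensed from the execute() below:

    // Per-lane packing into the instruction's staging buffer.
    VecElemU32 *buf = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
    buf[lane * 4 + 0] = data0[lane];
    buf[lane * 4 + 1] = data1[lane];
    buf[lane * 4 + 2] = data2[lane];    // b128 also fills lane * 4 + 3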
- void - Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<3>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B128 class methods --- - - Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b128") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B128 - - Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() - { - } // ~Inst_DS__DS_WRITE_B128 - - // --- description from .arch file --- - // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. - // Qword write. 
- void - Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_READ_B96 class methods --- - - Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b96") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B96 - - Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() - { - } // ~Inst_DS__DS_READ_B96 - - // --- description from .arch file --- - // Tri-dword read. 
- void - Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<3>(gpuDynInst, offset); - } - - void - Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } - // --- Inst_DS__DS_READ_B128 class methods --- - - Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b128") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B128 - - Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() - { - } // ~Inst_DS__DS_READ_B128 - - // --- description from .arch file --- - // Qword read. - void - Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_X class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_X - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - 
Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - // --- description from .arch file --- - // Untyped buffer load 1 dword with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XY class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY - - // --- description from .arch file --- - // Untyped buffer load 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ - - // --- description from .arch file --- - // Untyped buffer load 3 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW - - // --- description from .arch file --- - // Untyped buffer load 4 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_X class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_X - ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_X - - Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X - - // --- description from .arch file --- - // Untyped buffer store 1 dword with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XY class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_XY - ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY - - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY - - // --- description from .arch file --- - // Untyped buffer store 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ - ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ - - // --- description from .arch file --- - // Untyped buffer store 3 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - - // --- description from .arch file --- - // Untyped buffer store 4 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - - // --- description from .arch file --- - // Untyped buffer load 1 dword with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - // --- description from .arch file --- - // Untyped buffer load 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Untyped buffer load 3 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Untyped buffer load 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_x") - { - setFlag(Store); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - // --- description from .arch file --- - // Untyped buffer store 1 dword with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - // --- description from .arch file --- - // Untyped buffer store 2 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Untyped buffer store 3 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Untyped buffer store 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_UBYTE class methods --- - - Inst_MUBUF__BUFFER_LOAD_UBYTE - ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_ubyte") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_UBYTE - - Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE() - { - } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE - - // --- description from .arch file --- - // Untyped buffer load unsigned byte (zero extend to VGPR destination). 
- void - Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // execute - - // --- Inst_MUBUF__BUFFER_LOAD_SBYTE class methods --- - - Inst_MUBUF__BUFFER_LOAD_SBYTE - ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_sbyte") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_SBYTE - - Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() - { - } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE - - // --- description from .arch file --- - // Untyped buffer load signed byte (sign extend to VGPR destination). - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_USHORT class methods --- - - Inst_MUBUF__BUFFER_LOAD_USHORT - ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_ushort") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_USHORT - - Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT() - { - } // ~Inst_MUBUF__BUFFER_LOAD_USHORT - - // --- description from .arch file --- - // Untyped buffer load unsigned short (zero extend to VGPR destination). 
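The implemented buffer loads in this section all select their VGPR address operands the same way: with OFFEN the first VADDR register supplies the byte offset, with IDXEN it supplies the buffer index, and with both set the index comes first and the offset second; calcAddr is then called with the offset operand ahead of the index operand, as in the execute() above. A condensed reading of that selection logic (a summary of the existing code, not new behavior):

    // IDXEN OFFEN   VADDR        VADDR+1      operands passed to calcAddr
    //   0     0     (unused)     (unused)     addr0, addr1  (neither read)
    //   0     1     voffset      -            addr0, addr1
    //   1     0     vindex       -            addr1, addr0
    //   1     1     vindex       voffset      addr1, addr0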
- void - Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // execute - - // --- Inst_MUBUF__BUFFER_LOAD_SSHORT class methods --- - - Inst_MUBUF__BUFFER_LOAD_SSHORT - ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_sshort") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_SSHORT - - Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() - { - } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT - - // --- description from .arch file --- - // Untyped buffer load signed short (sign extend to VGPR destination). - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORD - ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORD - - Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORD - - // --- description from .arch file --- - // Untyped buffer load dword. 
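buffer_load_ubyte and buffer_load_ushort above both widen the loaded element to a full dword before the VGPR writeback, a plain zero-extension of the byte or short returned by the memory pipeline. A minimal stand-alone sketch of that widening step (simplified types, not the gem5 operand classes):

    #include <cstdint>
    #include <iostream>

    // Zero-extend a sub-dword load into the 32-bit VGPR element, as the
    // ubyte/ushort completeAcc() bodies above do with their casts.
    inline uint32_t zextByte(uint8_t loaded)   { return loaded; }
    inline uint32_t zextShort(uint16_t loaded) { return loaded; }

    int main()
    {
        std::cout << zextByte(0xff) << ' ' << zextShort(0xffff) << '\n'; // 255 65535
    }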
- void - Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX2 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX2 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer load 2 dwords. 
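The same four-way IDXEN/OFFEN selection recurs in every MUBUF execute() above and below: with only OFFEN set VADDR supplies a byte offset, with only IDXEN set it supplies a buffer index, and with both set VADDR is the index and VADDR+1 the offset, which is why the operand order handed to calcAddr flips for the IDXEN cases. A condensed, self-contained sketch of just that selection (stand-in types; the real calcAddr also folds in the resource descriptor, SOFFSET and the immediate offset):

    #include <cstdint>
    #include <iostream>
    #include <utility>

    // Which of the two VGPR address operands ends up as the buffer index and
    // which as the byte offset, as a function of the IDXEN/OFFEN bits.
    std::pair<uint32_t, uint32_t>
    selectIdxOff(bool idxen, bool offen, uint32_t vaddr0, uint32_t vaddr1)
    {
        if (!idxen && !offen) return {0, 0};       // no VGPR contribution
        if (!idxen &&  offen) return {0, vaddr0};  // VADDR is the byte offset
        if ( idxen && !offen) return {vaddr0, 0};  // VADDR is the index
        return {vaddr0, vaddr1};                   // VADDR is the index, VADDR+1 the offset
    }

    int main()
    {
        auto [idx, off] = selectIdxOff(true, true, 2, 16);
        std::cout << "index=" << idx << " offset=" << off << '\n'; // index=2 offset=16
    }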
- void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX3 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX3 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer load 3 dwords. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX4 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX4 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer load 4 dwords. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - vdst3[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_BYTE class methods --- - - Inst_MUBUF__BUFFER_STORE_BYTE - ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_byte") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_BYTE - - Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() - { - } // ~Inst_MUBUF__BUFFER_STORE_BYTE - - // --- description from .arch file --- - // Untyped buffer store byte. 
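Each load's completeAcc() above consults oobMask, which the address calculation sets for lanes whose request fell outside the range described by the buffer resource descriptor; those lanes are written back as zero rather than with whatever the memory pipeline returned. A minimal sketch of that per-lane selection under simplified, fixed-width types:

    #include <array>
    #include <bitset>
    #include <cstdint>

    constexpr int NumLanes = 64;

    // Write loaded dwords back to the destination register, forcing lanes whose
    // address was out of the buffer's range to zero, as the completeAcc() bodies
    // above do for each returned dword.
    void writebackDword(const std::array<uint32_t, NumLanes> &loaded,
                        const std::bitset<NumLanes> &execMask,
                        const std::bitset<NumLanes> &oobMask,
                        std::array<uint32_t, NumLanes> &vdst)
    {
        for (int lane = 0; lane < NumLanes; ++lane) {
            if (!execMask[lane])
                continue;                          // inactive lanes keep their old value
            vdst[lane] = oobMask[lane] ? 0u : loaded[lane];
        }
    }

    int main()
    {
        std::array<uint32_t, NumLanes> loaded{}, vdst{};
        loaded[0] = 42;
        std::bitset<NumLanes> exec; exec.set(0);
        std::bitset<NumLanes> oob;                 // lane 0 is in range
        writebackDword(loaded, exec, oob, vdst);
        return vdst[0] == 42 ? 0 : 1;
    }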
- void - Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI8 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_SHORT class methods --- - - Inst_MUBUF__BUFFER_STORE_SHORT - ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_short") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_SHORT - - Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT() - { - } // ~Inst_MUBUF__BUFFER_STORE_SHORT - - // --- description from .arch file --- - // Untyped buffer store short. 
- void - Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI16 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_DWORD class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORD:: - Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORD - - Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORD - - // --- description from .arch file --- - // Untyped buffer store dword. 
- void - Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX2 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX2 - ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX2 - - Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer store 2 dwords. 
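The store execute() bodies above copy each active lane's source VGPR into gpuDynInst->d_data after issuing the request, and the wider variants below pack their additional dwords per lane the same way; initMemWrite() later streams that staging buffer out when the access is performed. A small sketch of the single-dword staging step with stand-in types:

    #include <array>
    #include <bitset>
    #include <cstdint>

    constexpr int NumLanes = 64;

    // Stage one dword per active lane into the request's data buffer, mirroring
    // the per-lane copy at the end of the store execute() bodies above.
    void stageStoreDword(const std::array<uint32_t, NumLanes> &data,
                         const std::bitset<NumLanes> &execMask,
                         std::array<uint32_t, NumLanes> &d_data)
    {
        for (int lane = 0; lane < NumLanes; ++lane) {
            if (execMask[lane])
                d_data[lane] = data[lane];
        }
    }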
- void - Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX3 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX3 - ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX3 - - Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer store 3 dwords. 
- void - Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX4 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX4 - ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX4 - - Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer store 4 dwords. 
- void - Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 3] - = data3[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_LDS_DWORD class methods --- - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD - ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_lds_dword") - { - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - // --- description from .arch file --- - // Store one DWORD from LDS memory to system memory without utilizing - // VGPRs. - void - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_WBINVL1 class methods --- - - Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1") - { - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1 - - Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1 - - // --- description from .arch file --- - // Write back and invalidate the shader L1. - // Always returns ACK to shader. 
- void - Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - - void - Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst) - { - // TODO: Fix it for gfx10. Once we have the new gfx10 cache model, we - // need to precisely communicate the writeback-invalidate operation to - // the new gfx10 coalescer rather than sending AcquireRelease markers. - // The SICoalescer would need to be updated appropriately as well. - injectGlobalMemFence(gpuDynInst); - } // initiateAcc - void - Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_WBINVL1_VOL class methods --- - - Inst_MUBUF__BUFFER_WBINVL1_VOL - ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") { - // This instruction is same as buffer_wbinvl1 instruction except this - // instruction only invalidate L1 shader line with MTYPE SC and GC. - // Since Hermes L1 (TCP) do not differentiate between its cache lines, - // this instruction currently behaves (and implemented ) exactly like - // buffer_wbinvl1 instruction. - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1_VOL - - Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL - - // --- description from .arch file --- - // Write back and invalidate the shader L1 only for lines that are marked - // --- volatile. - // Always returns ACK to shader. - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst) - { - injectGlobalMemFence(gpuDynInst); - } // initiateAcc - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SWAP - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP - - Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. 
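Every MUBUF atomic from here on derives its return behaviour from the GLC bit of the encoding: with GLC set the old memory value is returned to VDATA (AtomicReturn), otherwise the operation is issued fire-and-forget (AtomicNoReturn). A tiny sketch of that flag selection, using stand-in declarations rather than the gem5 ones:

    #include <cstdint>

    // Stand-ins for the encoding field and the two return-behaviour flags.
    struct InFmtMubufBits { uint32_t GLC : 1; };
    enum class AtomicKind { Return, NoReturn };

    // GLC=1: the pre-op value comes back to VDATA; GLC=0: no return data,
    // matching the if (instData.GLC) blocks in every atomic constructor below.
    AtomicKind atomicKindFromGlc(const InFmtMubufBits &bits)
    {
        return bits.GLC ? AtomicKind::Return : AtomicKind::NoReturn;
    }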
- void - Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 src(gpuDynInst, extData.VDATA); - ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1); - - rsrcDesc.read(); - offset.read(); - src.read(); - cmp.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->x_data))[lane] - = src[lane]; - (reinterpret_cast(gpuDynInst->a_data))[lane] - = cmp[lane]; - } - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_ADD - ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD - - Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - 
// tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SUB class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SUB - ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB - - Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMIN - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN - - Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMIN - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN - - Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMAX - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX - - Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMAX - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX - - Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_AND class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_AND - ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND - - Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_OR class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_OR - ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_OR - - Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_XOR class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_XOR - ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR - - Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_INC class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_INC - ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC - - Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_DEC class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_DEC - ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC - - Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. 
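The atomic inc/dec pseudocode above is the wrapping form: increment wraps to zero once the old value reaches DATA, and decrement reloads DATA when the old value is zero or already above it. A direct transcription of that 32-bit pseudocode into C++:

    #include <cstdint>

    // buffer_atomic_inc (32b): wrap to 0 once the old value reaches DATA (unsigned).
    uint32_t atomicIncWrap(uint32_t old_val, uint32_t data)
    {
        return (old_val >= data) ? 0u : old_val + 1u;
    }

    // buffer_atomic_dec (32b): reload DATA when the old value is 0 or above DATA.
    uint32_t atomicDecWrap(uint32_t old_val, uint32_t data)
    {
        return (old_val == 0u || old_val > data) ? data : old_val - 1u;
    }

    int main()
    {
        return (atomicIncWrap(4, 4) == 0 && atomicDecWrap(0, 7) == 7) ? 0 : 1;
    }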
- void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_AND_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_OR_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_INC_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - // --- description from .arch file --- - // Typed buffer load 1 dword with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - // --- description from .arch file --- - // Typed buffer load 2 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - // --- description from .arch file --- - // Typed buffer load 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - // --- description from .arch file --- - // Typed buffer load 4 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - // --- description from .arch file --- - // Typed buffer store 1 dword with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - // --- description from .arch file --- - // Typed buffer store 2 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - // --- description from .arch file --- - // Typed buffer store 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - // --- description from .arch file --- - // Typed buffer store 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X:: - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - // --- description from .arch file --- - // Typed buffer load 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - // --- description from .arch file --- - // Typed buffer load 2 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Typed buffer load 3 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Typed buffer load 4 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - // --- description from .arch file --- - // Typed buffer store 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - // --- description from .arch file --- - // Typed buffer store 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Typed buffer store 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Typed buffer store 4 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute( - GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD class methods --- - - Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD - - Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD() - { - } // ~Inst_MIMG__IMAGE_LOAD - - // --- description from .arch file --- - // Image memory load with format conversion specified in T#. No sampler. - void - Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_MIP class methods --- - - Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_mip") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_MIP - - Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP() - { - } // ~Inst_MIMG__IMAGE_LOAD_MIP - - // --- description from .arch file --- - // Image memory load with user-supplied mip level. No sampler. 
- void - Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_PCK class methods --- - - Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_pck") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_PCK - - Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK() - { - } // ~Inst_MIMG__IMAGE_LOAD_PCK - - // --- description from .arch file --- - // Image memory load with no format conversion. No sampler. - void - Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_PCK_SGN class methods --- - - Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_pck_sgn") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_PCK_SGN - - Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN() - { - } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN - - // --- description from .arch file --- - // Image memory load with with no format conversion and sign extension. No - // --- sampler. - void - Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK class methods --- - - Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_mip_pck") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_MIP_PCK - - Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK() - { - } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK - - // --- description from .arch file --- - // Image memory load with user-supplied mip level, no format conversion. No - // --- sampler. - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN class methods --- - - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_mip_pck_sgn") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN - - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN() - { - } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN - - // --- description from .arch file --- - // Image memory load with user-supplied mip level, no format conversion and - // --- with sign extension. No sampler. 
- void - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE class methods --- - - Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE - - Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE() - { - } // ~Inst_MIMG__IMAGE_STORE - - // --- description from .arch file --- - // Image memory store with format conversion specified in T#. No sampler. - void - Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE_MIP class methods --- - - Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store_mip") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE_MIP - - Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP() - { - } // ~Inst_MIMG__IMAGE_STORE_MIP - - // --- description from .arch file --- - // Image memory store with format conversion specified in T# to user - // specified mip level. No sampler. - void - Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE_PCK class methods --- - - Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store_pck") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE_PCK - - Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK() - { - } // ~Inst_MIMG__IMAGE_STORE_PCK - - // --- description from .arch file --- - // Image memory store of packed data without format conversion. No sampler. - void - Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE_MIP_PCK class methods --- - - Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store_mip_pck") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE_MIP_PCK - - Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK() - { - } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK - - // --- description from .arch file --- - // Image memory store of packed data without format conversion to - // user-supplied mip level. No sampler. 
- void - Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_GET_RESINFO class methods --- - - Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_get_resinfo") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GET_RESINFO - - Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO() - { - } // ~Inst_MIMG__IMAGE_GET_RESINFO - - // --- description from .arch file --- - // return resource info for a given mip level specified in the address - // vgpr. No sampler. Returns 4 integer values into VGPRs 3-0: - // {num_mip_levels, depth, height, width}. - void - Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SWAP class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SWAP - - Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_CMPSWAP class methods --- - - Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP - - Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_ADD class methods --- - - Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_ADD - - Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. 
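The image_atomic_swap and image_atomic_cmpswap descriptions above define the 32-bit swap and compare-and-swap behaviour. A small reference sketch of those semantics, with illustrative names that are not gem5 APIs (per the .arch text, DATA[0] is the swap source and DATA[1] the compare value):

#include <cstdint>

uint32_t
refAtomicSwap(uint32_t &mem, uint32_t data)
{
    uint32_t tmp = mem;   // RETURN_DATA = old value
    mem = data;
    return tmp;
}

uint32_t
refAtomicCmpSwap(uint32_t &mem, uint32_t src, uint32_t cmp)
{
    uint32_t tmp = mem;
    mem = (tmp == cmp) ? src : tmp;   // store src only when the compare hits
    return tmp;                       // old value is returned either way
}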
- void - Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SUB class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SUB - - Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SMIN class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SMIN - - Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_UMIN class methods --- - - Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_UMIN - - Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SMAX class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SMAX - - Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_UMAX class methods --- - - Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_UMAX - - Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_AND class methods --- - - Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_AND - - Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_OR class methods --- - - Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_OR - - Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_XOR class methods --- - - Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_XOR - - Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. 
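The smin/umin/smax/umax image atomics above differ only in whether the 32-bit payload is compared as signed or unsigned. A brief sketch of that distinction (illustrative helpers, not gem5 APIs):

#include <cstdint>

uint32_t
refAtomicSMax(uint32_t &mem, uint32_t data)
{
    uint32_t tmp = mem;
    // signed compare: reinterpret both operands as int32_t
    mem = (static_cast<int32_t>(data) > static_cast<int32_t>(tmp)) ? data : tmp;
    return tmp;
}

uint32_t
refAtomicUMax(uint32_t &mem, uint32_t data)
{
    uint32_t tmp = mem;
    mem = (data > tmp) ? data : tmp;   // unsigned compare
    return tmp;
}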
- void - Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_INC class methods --- - - Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_INC - - Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_DEC class methods --- - - Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_DEC - - Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE class methods --- - - Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample") - { - } // Inst_MIMG__IMAGE_SAMPLE - - Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE() - { - } // ~Inst_MIMG__IMAGE_SAMPLE - - // --- description from .arch file --- - // sample texture map. - void - Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CL - - Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader. 
- void - Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D - - Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D - - // --- description from .arch file --- - // sample texture map, with user derivatives - void - Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D_CL - - Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader, with user - // --- derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_L class methods --- - - Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_L - - Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_L - - // --- description from .arch file --- - // sample texture map, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B - - Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B - - // --- description from .arch file --- - // sample texture map, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B_CL - - Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_LZ class methods --- - - Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_LZ - - Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_LZ - - // --- description from .arch file --- - // sample texture map, from level 0. 
- void - Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C - - Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C - - // --- description from .arch file --- - // sample texture map, with PCF. - void - Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CL - - Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader. - void - Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D - - Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D - - // --- description from .arch file --- - // SAMPLE_C, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_L class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_L - - Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_L - - // --- description from .arch file --- - // SAMPLE_C, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B - - Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B - - // --- description from .arch file --- - // SAMPLE_C, with lod bias. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_LZ - - Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ - - // --- description from .arch file --- - // SAMPLE_C, from level 0. - void - Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_O - - Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_O - - // --- description from .arch file --- - // sample texture map, with user offsets. - void - Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CL_O - - Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O - - // --- description from .arch file --- - // SAMPLE_O with LOD clamp specified in shader. - void - Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D_O - - Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D_O - - // --- description from .arch file --- - // SAMPLE_O, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O - - Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O - - // --- description from .arch file --- - // SAMPLE_O, with LOD clamp specified in shader, with user derivatives. 
- void - Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_L_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_L_O - - Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_L_O - - // --- description from .arch file --- - // SAMPLE_O, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B_O - - Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B_O - - // --- description from .arch file --- - // SAMPLE_O, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O - - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O - - // --- description from .arch file --- - // SAMPLE_O, with LOD clamp specified in shader, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_LZ_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_LZ_O - - Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O - - // --- description from .arch file --- - // SAMPLE_O, from level 0. - void - Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_O - - Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_O - - // --- description from .arch file --- - // SAMPLE_C with user specified offsets. - void - Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_O - - Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_L_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_L_O - - Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_O - - Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_O - - // --- description from .arch file --- - // SAMPLE_C_O, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O - - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O - - // --- description from .arch file --- - // SAMPLE_C_O, from level 0. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4 class methods --- - - Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4 - - Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4() - { - } // ~Inst_MIMG__IMAGE_GATHER4 - - // --- description from .arch file --- - // gather 4 single component elements (2x2). - void - Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_CL - - Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD clamp. - void - Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_L class methods --- - - Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_L - - Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L() - { - } // ~Inst_MIMG__IMAGE_GATHER4_L - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD. - void - Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B class methods --- - - Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B - - Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias. - void - Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_CL - - Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias and clamp. - void - Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_LZ class methods --- - - Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_LZ - - Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ() - { - } // ~Inst_MIMG__IMAGE_GATHER4_LZ - - // --- description from .arch file --- - // gather 4 single component elements (2x2) at level 0. 
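image_gather4 above fetches one component from each texel of a 2x2 footprint and returns four values, one per destination channel. A rough sketch of that access pattern, assuming a caller-supplied texel accessor; the channel ordering of the result is hardware-defined and not modelled here:

#include <array>

template <typename TexelFn>   // TexelFn: float(int x, int y)
std::array<float, 4>
refGather4(TexelFn texelComponent, int x, int y)
{
    // collect the selected component from the 2x2 quad anchored at (x, y)
    return { texelComponent(x,     y    ),
             texelComponent(x + 1, y    ),
             texelComponent(x,     y + 1),
             texelComponent(x + 1, y + 1) };
}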
- void - Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C class methods --- - - Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C - - Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with PCF. - void - Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_CL - - Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD clamp and PCF. - void - Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_L class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_L - - Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_L - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD and PCF. - void - Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B - - Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias and PCF. - void - Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_CL - - Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias, clamp and PCF. - void - Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_LZ class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_LZ - - Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ - - // --- description from .arch file --- - // gather 4 single component elements (2x2) at level 0, with PCF. 
- void - Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_O - - Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_O - - // --- description from .arch file --- - // GATHER4, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_CL_O - - Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_CL_O - - // --- description from .arch file --- - // GATHER4_CL, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_L_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_L_O - - Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_L_O - - // --- description from .arch file --- - // GATHER4_L, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_O - - Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_O - - // --- description from .arch file --- - // GATHER4_B, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_CL_O - - Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O - - // --- description from .arch file --- - // GATHER4_B_CL, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_LZ_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_LZ_O - - Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_LZ_O - - // --- description from .arch file --- - // GATHER4_LZ, with user offsets. 
- void - Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_O - - Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_O - - // --- description from .arch file --- - // GATHER4_C, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_CL_O - - Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O - - // --- description from .arch file --- - // GATHER4_C_CL, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_L_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_L_O - - Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_L_O - - // --- description from .arch file --- - // GATHER4_C_L, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_O - - Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_O - - // --- description from .arch file --- - // GATHER4_B, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O - - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O - - // --- description from .arch file --- - // GATHER4_B_CL, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_LZ_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O - - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O - - // --- description from .arch file --- - // GATHER4_C_LZ, with user offsets. 
- void - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GET_LOD class methods --- - - Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_get_lod") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GET_LOD - - Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() - { - } // ~Inst_MIMG__IMAGE_GET_LOD - - // --- description from .arch file --- - // Return calculated LOD. Vdata gets 2 32bit integer values: { rawLOD, - // --- clampedLOD }. - void - Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD - - Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD - - // --- description from .arch file --- - // sample texture map, with user derivatives (LOD per quad) - void - Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_CL - - Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader, with user - // --- derivatives (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD - - Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD - - // --- description from .arch file --- - // SAMPLE_C, with user derivatives (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader, with user derivatives - // (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_O - - Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_O - - // --- description from .arch file --- - // SAMPLE_O, with user derivatives (LOD per quad). 
- void - Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O - - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O - - // --- description from .arch file --- - // SAMPLE_O, with LOD clamp specified in shader, with user derivatives - // (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O - - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user derivatives (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives - // (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_EXP__EXP class methods --- - - Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt) - : Inst_EXP(iFmt, "exp") - { - } // Inst_EXP__EXP - - Inst_EXP__EXP::~Inst_EXP__EXP() - { - } // ~Inst_EXP__EXP - - // --- description from .arch file --- - // Export through SX. - void - Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_LOAD_UBYTE class methods --- - - Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_ubyte") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_UBYTE - - Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() - { - } // ~Inst_FLAT__FLAT_LOAD_UBYTE - - // --- description from .arch file --- - // Untyped buffer load unsigned byte (zero extend to VGPR destination). 
-    void
-    Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-
-        if (gpuDynInst->exec_mask.none()) {
-            wf->decVMemInstsIssued();
-            if (isFlat()) {
-                wf->decLGKMInstsIssued();
-            }
-            return;
-        }
-
-        gpuDynInst->execUnitId = wf->execUnitId;
-        gpuDynInst->latency.init(gpuDynInst->computeUnit());
-        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
-
-        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
-
-        issueRequestHelper(gpuDynInst);
-    } // execute
-
-    void
-    Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
-    {
-        initMemRead<VecElemU8>(gpuDynInst);
-    } // initiateAcc
-
-    void
-    Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst)
-    {
-        VecOperandU32 vdst(gpuDynInst, extData.VDST);
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
-                    gpuDynInst->d_data))[lane]);
-            }
-        }
-        vdst.write();
-    } // completeAcc
-    // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods ---
-
-    Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt)
-        : Inst_FLAT(iFmt, "flat_load_sbyte")
-    {
-        setFlag(MemoryRef);
-        setFlag(Load);
-    } // Inst_FLAT__FLAT_LOAD_SBYTE
-
-    Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE()
-    {
-    } // ~Inst_FLAT__FLAT_LOAD_SBYTE
-
-    // --- description from .arch file ---
-    // Untyped buffer load signed byte (sign extend to VGPR destination).
-    void
-    Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst)
-    {
-        panicUnimplemented();
-    } // execute
-
-    void
-    Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
-    {
-    } // initiateAcc
-
-    void
-    Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst)
-    {
-    } // completeAcc
-    // --- Inst_FLAT__FLAT_LOAD_USHORT class methods ---
-
-    Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt)
-        : Inst_FLAT(iFmt, "flat_load_ushort")
-    {
-        setFlag(MemoryRef);
-        setFlag(Load);
-    } // Inst_FLAT__FLAT_LOAD_USHORT
-
-    Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT()
-    {
-    } // ~Inst_FLAT__FLAT_LOAD_USHORT
-
-    // --- description from .arch file ---
-    // Untyped buffer load unsigned short (zero extend to VGPR destination).
-    void
-    Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-
-        if (gpuDynInst->exec_mask.none()) {
-            wf->decVMemInstsIssued();
-            if (isFlat()) {
-                wf->decLGKMInstsIssued();
-            }
-            return;
-        }
-
-        gpuDynInst->execUnitId = wf->execUnitId;
-        gpuDynInst->latency.init(gpuDynInst->computeUnit());
-        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
-
-        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
-
-        issueRequestHelper(gpuDynInst);
-    } // execute
-
-    void
-    Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
-    {
-        initMemRead<VecElemU16>(gpuDynInst);
-    } // initiateAcc
-
-    void
-    Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst)
-    {
-        VecOperandU32 vdst(gpuDynInst, extData.VDST);
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
-                    gpuDynInst->d_data))[lane]);
-            }
-        }
-        vdst.write();
-    } // completeAcc
-
-    // --- Inst_FLAT__FLAT_LOAD_SSHORT class methods ---
-
-    Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt)
-        : Inst_FLAT(iFmt, "flat_load_sshort")
-    {
-        setFlag(MemoryRef);
-        setFlag(Load);
-    } // Inst_FLAT__FLAT_LOAD_SSHORT
-
-    Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT()
-    {
-    } // ~Inst_FLAT__FLAT_LOAD_SSHORT
-
-    // --- description from .arch file ---
-    // Untyped buffer load signed short (sign extend to VGPR destination).
-    void
-    Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst)
-    {
-        panicUnimplemented();
-    } // execute
-
-    void
-    Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
-    {
-    } // initiateAcc
-
-    void
-    Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst)
-    {
-    } // completeAcc
-    // --- Inst_FLAT__FLAT_LOAD_DWORD class methods ---
-
-    Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt)
-        : Inst_FLAT(iFmt, "flat_load_dword")
-    {
-        setFlag(MemoryRef);
-        setFlag(Load);
-    } // Inst_FLAT__FLAT_LOAD_DWORD
-
-    Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD()
-    {
-    } // ~Inst_FLAT__FLAT_LOAD_DWORD
-
-    // --- description from .arch file ---
-    // Untyped buffer load dword.
- void - Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX2 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX2 - - Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer load 2 dwords. - void - Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX3 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX3 - - Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer load 3 dwords. 
- void - Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX4 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX4 - - Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer load 4 dwords. - void - Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_BYTE class methods --- - - Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_byte") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_BYTE - - Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE() - { - } // ~Inst_FLAT__FLAT_STORE_BYTE - - 
-    // --- description from .arch file ---
-    // Untyped buffer store byte.
-    void
-    Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-
-        if (gpuDynInst->exec_mask.none()) {
-            wf->decVMemInstsIssued();
-            if (isFlat()) {
-                wf->decLGKMInstsIssued();
-            }
-            wf->decExpInstsIssued();
-            return;
-        }
-
-        gpuDynInst->execUnitId = wf->execUnitId;
-        gpuDynInst->latency.init(gpuDynInst->computeUnit());
-        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
-
-        ConstVecOperandU8 data(gpuDynInst, extData.DATA);
-
-        data.read();
-
-        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
-        issueRequestHelper(gpuDynInst);
-    } // execute
-
-    void
-    Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
-    {
-        initMemWrite<VecElemU8>(gpuDynInst);
-    } // initiateAcc
-
-    void
-    Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst)
-    {
-    } // completeAcc
-    // --- Inst_FLAT__FLAT_STORE_SHORT class methods ---
-
-    Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt)
-        : Inst_FLAT(iFmt, "flat_store_short")
-    {
-        setFlag(MemoryRef);
-        setFlag(Store);
-    } // Inst_FLAT__FLAT_STORE_SHORT
-
-    Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT()
-    {
-    } // ~Inst_FLAT__FLAT_STORE_SHORT
-
-    // --- description from .arch file ---
-    // Untyped buffer store short.
-    void
-    Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst)
-    {
-        Wavefront *wf = gpuDynInst->wavefront();
-
-        if (gpuDynInst->exec_mask.none()) {
-            wf->decVMemInstsIssued();
-            if (isFlat()) {
-                wf->decLGKMInstsIssued();
-            }
-            wf->decExpInstsIssued();
-            return;
-        }
-
-        gpuDynInst->execUnitId = wf->execUnitId;
-        gpuDynInst->latency.init(gpuDynInst->computeUnit());
-        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
-
-        ConstVecOperandU16 data(gpuDynInst, extData.DATA);
-
-        data.read();
-
-        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (gpuDynInst->exec_mask[lane]) {
-                (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane]
-                    = data[lane];
-            }
-        }
-
-        issueRequestHelper(gpuDynInst);
-    } // execute
-
-    void
-    Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
-    {
-        initMemWrite<VecElemU16>(gpuDynInst);
-    } // initiateAcc
-
-    void
-    Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst)
-    {
-    } // completeAcc
-    // --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods ---
-
-    Inst_FLAT__FLAT_STORE_SHORT_D16_HI::
-        Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt)
-        : Inst_FLAT(iFmt, "flat_store_short_d16_hi")
-    {
-        setFlag(MemoryRef);
-        setFlag(Store);
-    } // Inst_FLAT__FLAT_STORE_SHORT_D16_HI
-
-    Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI()
-    {
-    } // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI
-
-    // --- description from .arch file ---
-    // Untyped buffer store short.
- void - Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = (data[lane] >> 16); - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORD class methods --- - - Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORD - - Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() - { - } // ~Inst_FLAT__FLAT_STORE_DWORD - - // --- description from .arch file --- - // Untyped buffer store dword. - void - Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX2 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX2 - - Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer store 2 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX3 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX3 - - Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer store 3 dwords. - void - Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX4 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX4 - - Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer store 4 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); - - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SWAP class methods --- - - Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP - - Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - - // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP - ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD - - Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SUB class methods --- - - Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB - - Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMIN class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN - - Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMIN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMIN::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMIN class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN - - Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMIN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMIN::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMAX class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX - - Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMAX::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMAX::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMAX class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX - - Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMAX::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMAX::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_AND class methods --- - - Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND - - Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_AND::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_AND::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_OR class methods --- - - Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR - - Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_OR::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_OR::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - - // --- Inst_FLAT__FLAT_ATOMIC_XOR class methods --- - - Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR - - Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_XOR::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_XOR::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_INC class methods --- - - Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC - - Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_DEC class methods --- - - Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC - - Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SWAP_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SUB_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMIN_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMIN_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMAX_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMAX_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_AND_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND_X2 - - Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_AND_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_AND_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_OR_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR_X2 - - Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_OR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_OR_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_XOR_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_INC_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC_X2 - - Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_DEC_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD_F32 class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD_F32::Inst_FLAT__FLAT_ATOMIC_ADD_F32( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_f32") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_F32 - - Inst_FLAT__FLAT_ATOMIC_ADD_F32::~Inst_FLAT__FLAT_ATOMIC_ADD_F32() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F32 - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 class methods --- - - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_pk_add_f16") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 - - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16() - { - } // ~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 - - void - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD_F64 class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD_F64::Inst_FLAT__FLAT_ATOMIC_ADD_F64( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_f64") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_F64 - - Inst_FLAT__FLAT_ATOMIC_ADD_F64::~Inst_FLAT__FLAT_ATOMIC_ADD_F64() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F64 - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F64::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F64::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_MIN_F64 class methods --- - - Inst_FLAT__FLAT_ATOMIC_MIN_F64::Inst_FLAT__FLAT_ATOMIC_MIN_F64( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_min_f64") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_MIN_F64 - - Inst_FLAT__FLAT_ATOMIC_MIN_F64::~Inst_FLAT__FLAT_ATOMIC_MIN_F64() - { - } // ~Inst_FLAT__FLAT_ATOMIC_MIN_F64 - - void - Inst_FLAT__FLAT_ATOMIC_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - 
atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_MIN_F64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_MIN_F64::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_MAX_F64 class methods --- - - Inst_FLAT__FLAT_ATOMIC_MAX_F64::Inst_FLAT__FLAT_ATOMIC_MAX_F64( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_max_f64") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_MAX_F64 - - Inst_FLAT__FLAT_ATOMIC_MAX_F64::~Inst_FLAT__FLAT_ATOMIC_MAX_F64() - { - } // ~Inst_FLAT__FLAT_ATOMIC_MAX_F64 - - void - Inst_FLAT__FLAT_ATOMIC_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_MAX_F64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_MAX_F64::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_VOP3P__V_PK_FMA_F32 class methods --- - - Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_fma_f32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_FMA_F32 - - Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32() - { - } // ~Inst_VOP3P__V_PK_FMA_F32 - - // D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] = - // S0.f[31:0] * S1.f[31:0] + S2.f[31:0] . - void - Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. U64 is used here as float - // values cannot use bitwise operations. Consider the U64 to imply - // untyped 64-bits of data. - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - int opsel = instData.OPSEL; - int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32) - : bits(src2[lane], 31, 0); - - float dword1 = std::fma(*reinterpret_cast<float*>(&s0l), - *reinterpret_cast<float*>(&s1l), - *reinterpret_cast<float*>(&s2l)); - - uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - uint32_t s2h = (opsel_hi & 4) ?
bits(src2[lane], 63, 32) - : bits(src2[lane], 31, 0); - - float dword2 = std::fma(*reinterpret_cast<float*>(&s0h), - *reinterpret_cast<float*>(&s1h), - *reinterpret_cast<float*>(&s2h)); - - uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1); - uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2); - - vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P__V_PK_MUL_F32 class methods --- - - Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_mul_f32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_MUL_F32 - - Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32() - { - } // ~Inst_VOP3P__V_PK_MUL_F32 - - // D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] * - // S1.f[31:0] - void - Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. U64 is used here as float - // values cannot use bitwise operations. Consider the U64 to imply - // untyped 64-bits of data. - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - int opsel = instData.OPSEL; - int opsel_hi = extData.OPSEL_HI; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword1 = *reinterpret_cast<float*>(&lower_dword) - * *reinterpret_cast<float*>(&upper_dword); - - lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword2 = *reinterpret_cast<float*>(&lower_dword) - * *reinterpret_cast<float*>(&upper_dword); - - uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1); - uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2); - - vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P__V_PK_ADD_F32 class methods --- - - Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_add_f32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_ADD_F32 - - Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32() - { - } // ~Inst_VOP3P__V_PK_ADD_F32 - - // D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] + - // S1.f[31:0] - void - Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. U64 is used here as float - // values cannot use bitwise operations. Consider the U64 to imply - // untyped 64-bits of data. - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - int opsel = instData.OPSEL; - int opsel_hi = extData.OPSEL_HI; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword1 = *reinterpret_cast<float*>(&lower_dword) - + *reinterpret_cast<float*>(&upper_dword); - - lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword2 = *reinterpret_cast<float*>(&lower_dword) - + *reinterpret_cast<float*>(&upper_dword); - - uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1); - uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2); - - vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P__V_PK_MOV_B32 class methods --- - - Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_mov_b32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_MOV_B32 - - Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32() - { - } // ~Inst_VOP3P__V_PK_MOV_B32 - - // D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0]. - void - Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - // Only OPSEL[1:0] are used - // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0 - - int opsel = instData.OPSEL; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - // OPSEL[1] 0/1: Upper dest dword = lower/upper dword of src1 - uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - vdst[lane] = upper_dword << 32 | lower_dword; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 class methods --- - - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8(InFmt_VOP3P_MAI *iFmt) - : Inst_VOP3P_MAI(iFmt, "v_mfma_i32_16x16x16i8") - { - setFlag(ALU); - } // Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 - - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: - ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8() - { - } // ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 - - // D(16x16I32) = A(16x16I8) x B(16x16I8) + C(16x16I32), 1 Blocks, 8 - // pass, srcA/srcB 1 archVgpr, srcC/D 4 accVGPR - void - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst) - { - int acc_offset = 0; - if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); - } - - // int8 size allows for 4 elements per lane. At 16x16 this means 4 - // lanes per column (A matrix) / (B matrix). This whole matrix fits - // in one VGPR. The C matrix with size int32 requires 4 VGPRs. - // Handle the C matrix by using a delta. This is set to 1 normally to - // move to the next VGPR (1 dword away) and 0 if the input is a scalar - // reg (e.g., a constant). - int delta = isVectorReg(extData.SRC2) ? 1 : 0; - - // VecOperandI8 will read 8 bits and sign extend, so used U32 to read - // as "untyped" 32-bit values.
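- // As a concrete illustration (example values only, not taken from the - // .arch file): if a lane of src0 holds 0x80FF0201, the four packed - // int8 elements decode as sext<8>(0x01) = 1, sext<8>(0x02) = 2, - // sext<8>(0xFF) = -1 and sext<8>(0x80) = -128, which fill columns - // start_col+0 through start_col+3 of that row of A below.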
- ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); - - VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1); - VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3); - - src0.readSrc(); - src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); - - int32_t A[16][16]; - for (int i = 0; i < 64; ++i) { - // src0[0:15] contains columns 1 - 4 packed for rows 0 - 15, - // src0[16:31] contains columns 5 - 8 packed for rows 0 - 15, - // src0[32:47] contains columns 9 - 12 packed for rows 0 - 15, - // src0[48:63] contains columns 13 - 16 packed for rows 0 - 15, - int row = i % 16; - int start_col = (i / 16) * 4; - - A[row][start_col+0] = sext<8>(bits(src0[i], 7, 0)); - A[row][start_col+1] = sext<8>(bits(src0[i], 15, 8)); - A[row][start_col+2] = sext<8>(bits(src0[i], 23, 16)); - A[row][start_col+3] = sext<8>(bits(src0[i], 31, 24)); - } - - int32_t B[16][16]; - for (int i = 0; i < 64; ++i) { - // src1[0:15] contains rows 1 - 4 packed for columns 0 - 15 - // src1[16:31] contains rows 5 - 8 packed for columns 0 - 15 - // src1[32:47] contains rows 9 - 12 packed for columns 0 - 15 - // src1[48:63] contains rows 13 - 16 packed for columns 0 - 15 - int start_row = (i / 16) * 4; - int col = i % 16; - - B[start_row+0][col] = sext<8>(bits(src1[i], 7, 0)); - B[start_row+1][col] = sext<8>(bits(src1[i], 15, 8)); - B[start_row+2][col] = sext<8>(bits(src1[i], 23, 16)); - B[start_row+3][col] = sext<8>(bits(src1[i], 31, 24)); - } - - int32_t result[16][16]; - - // Load accumulation matrix C into result - for (int i = 0; i < 64; ++i) { - // src2a contains rows 0, 4, 8, 12 - result[(i/16)*4][(i%16)] = src2a[i]; - // src2b contains rows 1, 5, 9, 13 - result[(i/16)*4+1][(i%16)] = src2b[i]; - // src2c contains rows 2, 6, 10, 14 - result[(i/16)*4+2][(i%16)] = src2c[i]; - // src2d contains rows 3, 7, 11, 15 - result[(i/16)*4+3][(i%16)] = src2d[i]; - } - - // Compute new result - This is (obviously) not optimized - for (int i = 0; i < 16; ++i) { - for (int j = 0; j < 16; ++j) { - for (int k = 0; k < 16; ++k) { - result[i][j] += A[i][k] * B[k][j]; - } - } - } - - // Put result in dest VGPRs - for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0, 4, 8, 12 - vdsta[i] = result[(i/16)*4][(i%16)]; - // vdstb contains rows 1, 5, 9, 13 - vdstb[i] = result[(i/16)*4+1][(i%16)]; - // vdstc contains rows 2, 6, 10, 14 - vdstc[i] = result[(i/16)*4+2][(i%16)]; - // vdstd contains rows 3, 7, 11, 15 - vdstd[i] = result[(i/16)*4+3][(i%16)]; - } - - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); - } // execute - // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods --- - - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI *iFmt) - : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64") - { - setFlag(ALU); - } // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 - - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: - ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64() - { - } // ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 - - // D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8 - // pass, srcA/srcB 
2 VGPR, srcC/D 8 VGPR - void - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst) - { - int acc_offset = 0; - if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); - } - - // Handling of src2 is a bit tricky. The operator[] overload cannot - // be used for dword count > 2, and the dword count here is 8. Usually - // src2 is a VGPR/AccGPR, but it might also be constant. In order to - // use operator[] and handle constants, check for VGPR here and set - // a delta for each of the pairs of src2 GPRs. - int delta = isVectorReg(extData.SRC2) ? 2 : 0; - - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); - - VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4); - VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6); - - src0.readSrc(); - src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); - - double result[16][16]; - - // Load src2 into result. src2 is row major - for (int i = 0; i < 64; ++i) { - // src2a contains rows 0 - 3 - result[(i/16)][(i%16)] = src2a[i]; - // src2b contains rows 4 - 7 - result[(i/16)+4][(i%16)] = src2b[i]; - // src2c contains rows 8 - 11 - result[(i/16)+8][(i%16)] = src2c[i]; - // src2d contains rows 12 - 15 - result[(i/16)+12][(i%16)] = src2d[i]; - } - - // Compute new result - for (int i = 0; i < 16; ++i) { - for (int j = 0; j < 16; ++j) { - for (int k = 0; k < 4; ++k) { - // src0 is column major, src1 is row major - int lane_A = 16*k + i; - int lane_B = 16*k + j; - result[i][j] += src0[lane_A] * src1[lane_B]; - } - } - } - - // Put result in dest VGPRs - for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0 - 3 - vdsta[i] = result[(i/16)][(i%16)]; - // vdstb contains rows 4 - 7 - vdstb[i] = result[(i/16)+4][(i%16)]; - // vdstc contains rows 8 - 11 - vdstc[i] = result[(i/16)+8][(i%16)]; - // vdstd contains rows 12 - 15 - vdstd[i] = result[(i/16)+12][(i%16)]; - } - - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); - } // execute -} // namespace VegaISA -} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/mimg.cc b/src/arch/amdgpu/vega/insts/mimg.cc new file mode 100644 index 0000000000..29a37cca1d --- /dev/null +++ b/src/arch/amdgpu/vega/insts/mimg.cc @@ -0,0 +1,2047 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_MIMG__IMAGE_LOAD class methods --- + + Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD + + Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD() + { + } // ~Inst_MIMG__IMAGE_LOAD + + // --- description from .arch file --- + // Image memory load with format conversion specified in T#. No sampler. + void + Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP + + Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP + + // --- description from .arch file --- + // Image memory load with user-supplied mip level. No sampler. + void + Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_PCK class methods --- + + Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_pck") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_PCK + + Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK() + { + } // ~Inst_MIMG__IMAGE_LOAD_PCK + + // --- description from .arch file --- + // Image memory load with no format conversion. No sampler. 
+ void + Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_PCK_SGN class methods --- + + Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_pck_sgn") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_PCK_SGN + + Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN() + { + } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN + + // --- description from .arch file --- + // Image memory load with with no format conversion and sign extension. No + // --- sampler. + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip_pck") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP_PCK + + Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK + + // --- description from .arch file --- + // Image memory load with user-supplied mip level, no format conversion. No + // --- sampler. + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip_pck_sgn") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN + + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN + + // --- description from .arch file --- + // Image memory load with user-supplied mip level, no format conversion and + // --- with sign extension. No sampler. + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE class methods --- + + Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE + + Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE() + { + } // ~Inst_MIMG__IMAGE_STORE + + // --- description from .arch file --- + // Image memory store with format conversion specified in T#. No sampler. 
+ void + Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_MIP class methods --- + + Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_mip") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_MIP + + Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP() + { + } // ~Inst_MIMG__IMAGE_STORE_MIP + + // --- description from .arch file --- + // Image memory store with format conversion specified in T# to user + // specified mip level. No sampler. + void + Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_PCK class methods --- + + Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_pck") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_PCK + + Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK() + { + } // ~Inst_MIMG__IMAGE_STORE_PCK + + // --- description from .arch file --- + // Image memory store of packed data without format conversion. No sampler. + void + Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_MIP_PCK class methods --- + + Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_mip_pck") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_MIP_PCK + + Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK() + { + } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK + + // --- description from .arch file --- + // Image memory store of packed data without format conversion to + // user-supplied mip level. No sampler. + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_GET_RESINFO class methods --- + + Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_get_resinfo") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GET_RESINFO + + Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO() + { + } // ~Inst_MIMG__IMAGE_GET_RESINFO + + // --- description from .arch file --- + // return resource info for a given mip level specified in the address + // vgpr. No sampler. Returns 4 integer values into VGPRs 3-0: + // {num_mip_levels, depth, height, width}. 
+ void + Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SWAP class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SWAP + + Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_CMPSWAP class methods --- + + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP + + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_ADD class methods --- + + Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_ADD + + Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SUB class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SUB + + Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SMIN class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_smin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SMIN + + Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_UMIN class methods --- + + Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_umin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_UMIN + + Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SMAX class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_smax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SMAX + + Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_UMAX class methods --- + + Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_umax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_UMAX + + Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_AND class methods --- + + Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_and") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_AND + + Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_AND + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_OR class methods --- + + Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_or") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_OR + + Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_OR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_XOR class methods --- + + Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_xor") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_XOR + + Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_XOR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_INC class methods --- + + Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_inc") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_INC + + Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_INC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_DEC class methods --- + + Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_dec") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_DEC + + Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_DEC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE class methods --- + + Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample") + { + } // Inst_MIMG__IMAGE_SAMPLE + + Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE() + { + } // ~Inst_MIMG__IMAGE_SAMPLE + + // --- description from .arch file --- + // sample texture map. + void + Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CL + + Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader. + void + Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D + + Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D + + // --- description from .arch file --- + // sample texture map, with user derivatives + void + Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D_CL + + Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader, with user + // --- derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_L class methods --- + + Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_L + + Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_L + + // --- description from .arch file --- + // sample texture map, with user LOD. 
+ void + Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B + + Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B + + // --- description from .arch file --- + // sample texture map, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B_CL + + Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_LZ class methods --- + + Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_LZ + + Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_LZ + + // --- description from .arch file --- + // sample texture map, from level 0. + void + Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C + + Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C + + // --- description from .arch file --- + // sample texture map, with PCF. + void + Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CL + + Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader. + void + Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D + + Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D + + // --- description from .arch file --- + // SAMPLE_C, with user derivatives. 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_L class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_L + + Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_L + + // --- description from .arch file --- + // SAMPLE_C, with user LOD. + void + Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B + + Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B + + // --- description from .arch file --- + // SAMPLE_C, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_LZ + + Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ + + // --- description from .arch file --- + // SAMPLE_C, from level 0. + void + Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_O + + Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_O + + // --- description from .arch file --- + // sample texture map, with user offsets. 
+ void + Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CL_O + + Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O + + // --- description from .arch file --- + // SAMPLE_O with LOD clamp specified in shader. + void + Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D_O + + Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D_O + + // --- description from .arch file --- + // SAMPLE_O, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O + + Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O + + // --- description from .arch file --- + // SAMPLE_O, with LOD clamp specified in shader, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_L_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_L_O + + Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_L_O + + // --- description from .arch file --- + // SAMPLE_O, with user LOD. + void + Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B_O + + Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B_O + + // --- description from .arch file --- + // SAMPLE_O, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O + + Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O + + // --- description from .arch file --- + // SAMPLE_O, with LOD clamp specified in shader, with lod bias. 
+ void + Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_LZ_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_LZ_O + + Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O + + // --- description from .arch file --- + // SAMPLE_O, from level 0. + void + Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_O + + Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_O + + // --- description from .arch file --- + // SAMPLE_C with user specified offsets. + void + Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader. + void + Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D_O + + Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O + + // --- description from .arch file --- + // SAMPLE_C_O, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_L_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_L_O + + Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O + + // --- description from .arch file --- + // SAMPLE_C_O, with user LOD. 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B_O + + Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_O + + // --- description from .arch file --- + // SAMPLE_C_O, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O + + Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O + + // --- description from .arch file --- + // SAMPLE_C_O, from level 0. + void + Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4 class methods --- + + Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4 + + Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4() + { + } // ~Inst_MIMG__IMAGE_GATHER4 + + // --- description from .arch file --- + // gather 4 single component elements (2x2). + void + Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_CL + + Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD clamp. + void + Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_L class methods --- + + Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_L + + Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L() + { + } // ~Inst_MIMG__IMAGE_GATHER4_L + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD. 
+ void + Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B class methods --- + + Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B + + Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias. + void + Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B_CL + + Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias and clamp. + void + Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_LZ class methods --- + + Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_LZ + + Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ() + { + } // ~Inst_MIMG__IMAGE_GATHER4_LZ + + // --- description from .arch file --- + // gather 4 single component elements (2x2) at level 0. + void + Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C class methods --- + + Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C + + Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with PCF. + void + Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_CL + + Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD clamp and PCF. + void + Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_L class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_L + + Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_L + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD and PCF. 
+ void + Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B + + Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias and PCF. + void + Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_CL + + Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias, clamp and PCF. + void + Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_LZ class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_LZ + + Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ + + // --- description from .arch file --- + // gather 4 single component elements (2x2) at level 0, with PCF. + void + Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_O + + Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_O + + // --- description from .arch file --- + // GATHER4, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_CL_O + + Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_CL_O + + // --- description from .arch file --- + // GATHER4_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_L_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_L_O + + Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_L_O + + // --- description from .arch file --- + // GATHER4_L, with user offsets. 
+ void + Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B_O + + Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B_O + + // --- description from .arch file --- + // GATHER4_B, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B_CL_O + + Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O + + // --- description from .arch file --- + // GATHER4_B_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_LZ_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_LZ_O + + Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_LZ_O + + // --- description from .arch file --- + // GATHER4_LZ, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_O + + Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_O + + // --- description from .arch file --- + // GATHER4_C, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_CL_O + + Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O + + // --- description from .arch file --- + // GATHER4_C_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_L_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_L_O + + Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_L_O + + // --- description from .arch file --- + // GATHER4_C_L, with user offsets. 
+ void + Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_O + + Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_O + + // --- description from .arch file --- + // GATHER4_B, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O + + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O + + // --- description from .arch file --- + // GATHER4_B_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_LZ_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O + + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O + + // --- description from .arch file --- + // GATHER4_C_LZ, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GET_LOD class methods --- + + Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_get_lod") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GET_LOD + + Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() + { + } // ~Inst_MIMG__IMAGE_GET_LOD + + // --- description from .arch file --- + // Return calculated LOD. Vdata gets 2 32bit integer values: { rawLOD, + // --- clampedLOD }. + void + Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD + + Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD + + // --- description from .arch file --- + // sample texture map, with user derivatives (LOD per quad) + void + Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_CL + + Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader, with user + // --- derivatives (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD + + Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD + + // --- description from .arch file --- + // SAMPLE_C, with user derivatives (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_O + + Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_O + + // --- description from .arch file --- + // SAMPLE_O, with user derivatives (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O + + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O + + // --- description from .arch file --- + // SAMPLE_O, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O + + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O + + // --- description from .arch file --- + // SAMPLE_C_O, with user derivatives (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/mtbuf.cc b/src/arch/amdgpu/vega/insts/mtbuf.cc new file mode 100644 index 0000000000..2b37dfd6b9 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/mtbuf.cc @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_X + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_x") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X + + // --- description from .arch file --- + // Typed buffer load 1 dword with format conversion. 
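+ // Note: unlike the untyped MUBUF operations later in this patch, MTBUF
+ // instructions carry the buffer data/numeric format in the instruction
+ // encoding itself (the DFMT/NFMT fields in the Vega encoding) rather than
+ // taking it from the buffer resource descriptor, which is what makes these
+ // "typed" buffer accesses. Every MTBUF opcode in this file is currently a
+ // panicUnimplemented() stub.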
+ void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_xy") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY + + // --- description from .arch file --- + // Typed buffer load 2 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ + + // --- description from .arch file --- + // Typed buffer load 3 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW + ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW + + // --- description from .arch file --- + // Typed buffer load 4 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_X + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_x") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X + + Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X + + // --- description from .arch file --- + // Typed buffer store 1 dword with format conversion. 
+ void + Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_XY + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_xy") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY + + Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY + + // --- description from .arch file --- + // Typed buffer store 2 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ + + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ + ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ + + // --- description from .arch file --- + // Typed buffer store 3 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW + + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW + ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW + + // --- description from .arch file --- + // Typed buffer store 4 dwords with format conversion. 
+ void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X:: + ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X + + // --- description from .arch file --- + // Typed buffer load 1 dword with format conversion. + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY + ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY + + // --- description from .arch file --- + // Typed buffer load 2 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ( + InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ + ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ + + // --- description from .arch file --- + // Typed buffer load 3 dwords with format conversion. 
+ void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods --- + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW + ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW( + InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW + + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW + ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW() + { + } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW + + // --- description from .arch file --- + // Typed buffer load 4 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X + ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X + + // --- description from .arch file --- + // Typed buffer store 1 dword with format conversion. + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY + ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY + + // --- description from .arch file --- + // Typed buffer store 2 dwords with format conversion. 
+ void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ + ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ + + // --- description from .arch file --- + // Typed buffer store 3 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods --- + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW + ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt) + : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW + + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW + ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW() + { + } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW + + // --- description from .arch file --- + // Typed buffer store 4 dwords with format conversion. + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute( + GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/mubuf.cc b/src/arch/amdgpu/vega/insts/mubuf.cc new file mode 100644 index 0000000000..ff8bae2475 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/mubuf.cc @@ -0,0 +1,2789 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_X class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_X + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_x") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X + + Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X + + // --- description from .arch file --- + // Untyped buffer load 1 dword with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XY class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_XY + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_xy") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY + + Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY + + // --- description from .arch file --- + // Untyped buffer load 2 dwords with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_xyz") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ + + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ + + // --- description from .arch file --- + // Untyped buffer load 3 dwords with format conversion. 
+ void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_xyzw") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW + + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW + + // --- description from .arch file --- + // Untyped buffer load 4 dwords with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_X class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_X + ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_x") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_X + + Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X + + // --- description from .arch file --- + // Untyped buffer store 1 dword with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XY class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_XY + ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_xy") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY + + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY + + // --- description from .arch file --- + // Untyped buffer store 2 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ + ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_xyz") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ + + // --- description from .arch file --- + // Untyped buffer store 3 dwords with format conversion. 
+ void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_xyzw") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + + // --- description from .arch file --- + // Untyped buffer store 4 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_x") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + + // --- description from .arch file --- + // Untyped buffer load 1 dword with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + + // --- description from .arch file --- + // Untyped buffer load 2 dwords with format conversion. 
+ void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + + // --- description from .arch file --- + // Untyped buffer load 3 dwords with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + + // --- description from .arch file --- + // Untyped buffer load 4 dwords with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_x") + { + setFlag(Store); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + + // --- description from .arch file --- + // Untyped buffer store 1 dword with format conversion. 
+ void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + + // --- description from .arch file --- + // Untyped buffer store 2 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + + // --- description from .arch file --- + // Untyped buffer store 3 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + + // --- description from .arch file --- + // Untyped buffer store 4 dwords with format conversion. 
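+ // This is the last of the format-conversion (BUFFER_*_FORMAT_*) MUBUF
+ // operations; like the ones above, it is a panicUnimplemented() stub.
+ // Several of the plain untyped loads that follow (ubyte, ushort, dword,
+ // dwordx2, dwordx3) are functionally implemented, while the sign-extending
+ // sbyte/sshort variants remain stubs.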
+ void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_UBYTE class methods --- + + Inst_MUBUF__BUFFER_LOAD_UBYTE + ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_ubyte") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_UBYTE + + Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE() + { + } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE + + // --- description from .arch file --- + // Untyped buffer load unsigned byte (zero extend to VGPR destination). + void + Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // execute + + // --- Inst_MUBUF__BUFFER_LOAD_SBYTE class methods --- + + Inst_MUBUF__BUFFER_LOAD_SBYTE + ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_sbyte") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_SBYTE + + Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE + + // --- description from .arch file --- + // Untyped buffer load signed byte (sign extend to VGPR destination). 
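+ // The execute() below is still a stub. If it were implemented, the memory
+ // side would presumably mirror BUFFER_LOAD_UBYTE above, with completeAcc()
+ // sign-extending each loaded byte instead of zero-extending it. A rough
+ // sketch (illustrative only, not the actual implementation):
+ //
+ //     VecOperandU32 vdst(gpuDynInst, extData.VDATA);
+ //     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ //         if (gpuDynInst->exec_mask[lane]) {
+ //             if (!oobMask[lane]) {
+ //                 // sign-extend the loaded byte into the 32-bit VGPR
+ //                 vdst[lane] = (VecElemU32)(VecElemI32)(reinterpret_cast<
+ //                     VecElemI8*>(gpuDynInst->d_data))[lane];
+ //             } else {
+ //                 vdst[lane] = 0;
+ //             }
+ //         }
+ //     }
+ //     vdst.write();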
+ void + Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_USHORT class methods --- + + Inst_MUBUF__BUFFER_LOAD_USHORT + ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_ushort") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_USHORT + + Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT() + { + } // ~Inst_MUBUF__BUFFER_LOAD_USHORT + + // --- description from .arch file --- + // Untyped buffer load unsigned short (zero extend to VGPR destination). + void + Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // execute + + // --- Inst_MUBUF__BUFFER_LOAD_SSHORT class methods --- + + Inst_MUBUF__BUFFER_LOAD_SSHORT + ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_sshort") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_SSHORT + + Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT + + // --- description from .arch file --- + // Untyped buffer load signed short (sign extend to VGPR destination). 
+ void + Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORD + ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORD + + Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORD + + // --- description from .arch file --- + // Untyped buffer load dword. + void + Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX2 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX2 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 + + Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer load 2 dwords. 
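+ // The execute() below uses the same addressing scheme as the ubyte, ushort
+ // and dword loads above: SRSRC*4 selects the 128-bit buffer resource
+ // descriptor, SOFFSET and the immediate OFFSET supply scalar offsets, and
+ // VADDR supplies a per-lane index and/or offset depending on the
+ // IDXEN/OFFEN bits. The operand order handed to calcAddr() is swapped in
+ // the IDXEN cases so that the index and offset registers line up with
+ // calcAddr()'s parameters.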
+ void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<2>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX3 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX3 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx3") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 + + Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer load 3 dwords. 
+ void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<3>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + vdst2[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX4 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX4 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 + + Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer load 4 dwords. 
+ void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + vdst2[lane] = 0; + vdst3[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_BYTE class methods --- + + Inst_MUBUF__BUFFER_STORE_BYTE + ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_byte") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_BYTE + + Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() + { + } // ~Inst_MUBUF__BUFFER_STORE_BYTE + + // --- description from .arch file --- + // Untyped buffer store byte. 
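+    // For buffer stores, execute() copies each active lane's source VGPR
+    // value into gpuDynInst->d_data; initMemWrite() then uses that staging
+    // buffer when the write is initiated. Stores are also tracked by the
+    // export counter, so the all-lanes-inactive early return decrements
+    // both the VMEM and EXP issue counts that were charged at issue time.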
+    void
+    Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decVMemInstsIssued();
+            wf->decExpInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandI8 data(gpuDynInst, extData.VDATA);
+
+        rsrcDesc.read();
+        offset.read();
+        data.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr(gpuDynInst,
+                addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr(gpuDynInst,
+                addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr(gpuDynInst,
+                addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr(gpuDynInst,
+                addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemI8*>(gpuDynInst->d_data))[lane]
+                    = data[lane];
+            }
+        }
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initMemWrite<VecElemI8>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_SHORT class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_SHORT
+        ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_short")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
+    } // Inst_MUBUF__BUFFER_STORE_SHORT
+
+    Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_SHORT
+
+    // --- description from .arch file ---
+    // Untyped buffer store short.
+ void + Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandI16 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_DWORD class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORD:: + Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORD + + Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORD + + // --- description from .arch file --- + // Untyped buffer store dword. 
+ void + Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX2 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX2 + ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX2 + + Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer store 2 dwords. 
+ void + Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<2>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX3 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX3 + ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx3") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX3 + + Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer store 3 dwords. 
+ void + Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + data2.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] + = data2[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<3>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX4 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX4 + ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX4 + + Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer store 4 dwords. 
+ void + Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] + = data2[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 3] + = data3[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_LDS_DWORD class methods --- + + Inst_MUBUF__BUFFER_STORE_LDS_DWORD + ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_lds_dword") + { + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD + + Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD() + { + } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD + + // --- description from .arch file --- + // Store one DWORD from LDS memory to system memory without utilizing + // VGPRs. + void + Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_WBINVL1 class methods --- + + Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_wbinvl1") + { + setFlag(MemoryRef); + setFlag(GPUStaticInst::MemSync); + setFlag(GlobalSegment); + setFlag(MemSync); + } // Inst_MUBUF__BUFFER_WBINVL1 + + Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1() + { + } // ~Inst_MUBUF__BUFFER_WBINVL1 + + // --- description from .arch file --- + // Write back and invalidate the shader L1. + // Always returns ACK to shader. 
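+    // The writeback/invalidate is not modeled line by line. The
+    // instruction is sent down the global memory pipeline and
+    // initiateAcc() calls injectGlobalMemFence(), so the cache maintenance
+    // op is approximated by an acquire/release-style memory fence (see the
+    // TODO below about refining this for the gfx10 coalescer).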
+ void + Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + gpuDynInst->computeUnit()->globalMemoryPipe. + issueRequest(gpuDynInst); + } else { + fatal("Unsupported scope for flat instruction.\n"); + } + } // execute + + void + Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst) + { + // TODO: Fix it for gfx10. Once we have the new gfx10 cache model, we + // need to precisely communicate the writeback-invalidate operation to + // the new gfx10 coalescer rather than sending AcquireRelease markers. + // The SICoalescer would need to be updated appropriately as well. + injectGlobalMemFence(gpuDynInst); + } // initiateAcc + void + Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_WBINVL1_VOL class methods --- + + Inst_MUBUF__BUFFER_WBINVL1_VOL + ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*iFmt) + : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") { + // This instruction is same as buffer_wbinvl1 instruction except this + // instruction only invalidate L1 shader line with MTYPE SC and GC. + // Since Hermes L1 (TCP) do not differentiate between its cache lines, + // this instruction currently behaves (and implemented ) exactly like + // buffer_wbinvl1 instruction. + setFlag(MemoryRef); + setFlag(GPUStaticInst::MemSync); + setFlag(GlobalSegment); + setFlag(MemSync); + } // Inst_MUBUF__BUFFER_WBINVL1_VOL + + Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL() + { + } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL + + // --- description from .arch file --- + // Write back and invalidate the shader L1 only for lines that are marked + // --- volatile. + // Always returns ACK to shader. + void + Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + gpuDynInst->computeUnit()->globalMemoryPipe. + issueRequest(gpuDynInst); + } else { + fatal("Unsupported scope for flat instruction.\n"); + } + } // execute + void + Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst) + { + injectGlobalMemFence(gpuDynInst); + } // initiateAcc + void + Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SWAP + ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SWAP + + Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. 
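+    // For all buffer atomics the GLC bit chooses the return behaviour:
+    // GLC=1 returns the pre-operation memory value to the VDATA VGPR(s)
+    // (AtomicReturn), GLC=0 discards it (AtomicNoReturn). completeAcc()
+    // therefore only writes d_data back to VDATA when isAtomicRet() is
+    // true, as in buffer_atomic_cmpswap below.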
+    void
+    Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP class methods ---
+
+    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP
+        ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap")
+    {
+        setFlag(AtomicCAS);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        setFlag(MemoryRef);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP
+
+    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP()
+    {
+    } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP
+
+    // --- description from .arch file ---
+    // 32b:
+    // tmp = MEM[ADDR];
+    // src = DATA[0];
+    // cmp = DATA[1];
+    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
+    // RETURN_DATA[0] = tmp.
+    void
+    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decVMemInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandU32 src(gpuDynInst, extData.VDATA);
+        ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1);
+
+        rsrcDesc.read();
+        offset.read();
+        src.read();
+        cmp.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr(gpuDynInst,
+                addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr(gpuDynInst,
+                addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr(gpuDynInst,
+                addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr(gpuDynInst,
+                addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->x_data))[lane]
+                    = src[lane];
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+                    = cmp[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess<VecElemU32>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        if (isAtomicRet()) {
+            VecOperandU32 vdst(gpuDynInst, extData.VDATA);
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                }
+            }
+
+            vdst.write();
+        }
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods ---
+
+    Inst_MUBUF__BUFFER_ATOMIC_ADD
+        ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_atomic_add")
+    {
+        setFlag(AtomicAdd);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        setFlag(MemoryRef);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_ATOMIC_ADD
+
+    Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD()
+    {
+    } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD
+
+    // --- description from .arch file ---
+    // 32b:
// tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SUB class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SUB + ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SUB + + Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMIN + ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMIN + + Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMIN + ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMIN + + Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMAX + ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMAX + + Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMAX + ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMAX + + Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_AND class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_AND + ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_and") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_AND + + Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_AND + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_OR class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_OR + ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_or") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_OR + + Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_OR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_XOR class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_XOR + ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_xor") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_XOR + + Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_INC class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_INC + ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_inc") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_INC + + Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_INC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_DEC class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_DEC + ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_dec") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_DEC + + Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA[0:1]; + // cmp = DATA[2:3]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_add_x2") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 + + Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_AND_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_AND_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_and_x2") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 + + Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_OR_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_OR_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_or_x2") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 + + Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_INC_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + + Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/smem.cc b/src/arch/amdgpu/vega/insts/smem.cc new file mode 100644 index 0000000000..a6af4f007d --- /dev/null +++ b/src/arch/amdgpu/vega/insts/smem.cc @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SMEM__S_LOAD_DWORD class methods --- + + Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORD + + Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD() + { + } // ~Inst_SMEM__S_LOAD_DWORD + + /** + * Read 1 dword from scalar data cache. If the offset is specified as an + * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are + * ignored). If the offset is specified as an immediate 20-bit constant, + * the constant is an unsigned byte offset. + */ + void + Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<1>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX2 class methods --- + + Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX2 + + Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() + { + } // ~Inst_SMEM__S_LOAD_DWORDX2 + + /** + * Read 2 dwords from scalar data cache. See s_load_dword for details on + * the offset input. 
+ */ + void + Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<2>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX4 class methods --- + + Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX4 + + Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4() + { + } // ~Inst_SMEM__S_LOAD_DWORDX4 + + // --- description from .arch file --- + // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. + void + Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX8 class methods --- + + Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX8 + + Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8() + { + } // ~Inst_SMEM__S_LOAD_DWORDX8 + + // --- description from .arch file --- + // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
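+    // As in the other s_load variants, the SBASE field is encoded in units
+    // of SGPR pairs, so it is shifted left by one to name the first SGPR
+    // of the 64-bit base address before the byte offset is applied.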
+ void + Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<8>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX16 class methods --- + + Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX16 + + Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16() + { + } // ~Inst_SMEM__S_LOAD_DWORDX16 + + // --- description from .arch file --- + // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. + void + Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<16>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORD class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORD + + Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD + + // --- description from .arch file --- + // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the + // --- offset input. 
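+    // s_buffer_load_* differs from s_load_* in that SBASE names a 128-bit
+    // buffer resource descriptor (V#) held in four SGPRs rather than a
+    // 64-bit address pair, so the base is read as a ConstScalarOperandU128
+    // and calcAddr() resolves the base address from the descriptor. The
+    // byte offset is either the 20-bit immediate (IMM=1) or the value of
+    // the SGPR named by OFFSET (IMM=0), exactly as for s_load_dword.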
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<1>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 1 request, size 32 + ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX2 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2 + + // --- description from .arch file --- + // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<2>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + // use U64 because 2 requests, each size 32 + ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX4 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4 + + // --- description from .arch file --- + // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 4 requests, each size 32 + ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX8 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8 + + // --- description from .arch file --- + // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<8>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 8 requests, each size 32 + ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX16 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16 + + // --- description from .arch file --- + // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<16>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 16 requests, each size 32 + ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_STORE_DWORD class methods --- + + Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_STORE_DWORD + + Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD() + { + } // ~Inst_SMEM__S_STORE_DWORD + + // --- description from .arch file --- + // Write 1 dword to scalar data cache. + // If the offset is specified as an SGPR, the SGPR contains an unsigned + // BYTE offset (the 2 LSBs are ignored). + // If the offset is specified as an immediate 20-bit constant, the + // constant is an unsigned BYTE offset. + void + Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); + + addr.read(); + sdata.read(); + + std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), + sizeof(ScalarRegU32)); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<1>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_STORE_DWORDX2 class methods --- + + Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_STORE_DWORDX2 + + Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2() + { + } // ~Inst_SMEM__S_STORE_DWORDX2 + + // --- description from .arch file --- + // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. 
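+    // As with S_STORE_DWORD above, the source registers are staged into
+    // gpuDynInst->scalar_data here; initMemWrite<N>() in initiateAcc()
+    // later performs the actual write of N dwords.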
+ void + Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); + + addr.read(); + sdata.read(); + + std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), + sizeof(ScalarRegU64)); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<2>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_STORE_DWORDX4 class methods --- + + Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_STORE_DWORDX4 + + Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4() + { + } // ~Inst_SMEM__S_STORE_DWORDX4 + + // --- description from .arch file --- + // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. + void + Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); + + addr.read(); + sdata.read(); + + std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), + sizeof(gpuDynInst->scalar_data)); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_BUFFER_STORE_DWORD class methods --- + + Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_BUFFER_STORE_DWORD + + Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() + { + } // ~Inst_SMEM__S_BUFFER_STORE_DWORD + + // --- description from .arch file --- + // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the + // --- offset input. 
+ void + Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_BUFFER_STORE_DWORDX2 class methods --- + + Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_BUFFER_STORE_DWORDX2 + + Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2() + { + } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2 + + // --- description from .arch file --- + // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. + void + Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_BUFFER_STORE_DWORDX4 class methods --- + + Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_BUFFER_STORE_DWORDX4 + + Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4() + { + } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4 + + // --- description from .arch file --- + // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. + void + Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_DCACHE_INV class methods --- + + Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_inv") + { + } // Inst_SMEM__S_DCACHE_INV + + Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV() + { + } // ~Inst_SMEM__S_DCACHE_INV + + // --- description from .arch file --- + // Invalidate the scalar data cache. + void + Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_DCACHE_WB class methods --- + + Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_wb") + { + } // Inst_SMEM__S_DCACHE_WB + + Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB() + { + } // ~Inst_SMEM__S_DCACHE_WB + + // --- description from .arch file --- + // Write back dirty data in the scalar data cache. + void + Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_DCACHE_INV_VOL class methods --- + + Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_inv_vol") + { + } // Inst_SMEM__S_DCACHE_INV_VOL + + Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL() + { + } // ~Inst_SMEM__S_DCACHE_INV_VOL + + // --- description from .arch file --- + // Invalidate the scalar data cache volatile lines. 
+ void + Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_DCACHE_WB_VOL class methods --- + + Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_wb_vol") + { + } // Inst_SMEM__S_DCACHE_WB_VOL + + Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL() + { + } // ~Inst_SMEM__S_DCACHE_WB_VOL + + // --- description from .arch file --- + // Write back dirty data in the scalar data cache volatile lines. + void + Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_MEMTIME class methods --- + + Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_memtime") + { + // s_memtime does not issue a memory request + setFlag(ALU); + } // Inst_SMEM__S_MEMTIME + + Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME() + { + } // ~Inst_SMEM__S_MEMTIME + + // --- description from .arch file --- + // Return current 64-bit timestamp. + void + Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); + sdst = (ScalarRegU64)gpuDynInst->computeUnit()->curCycle(); + sdst.write(); + } // execute + // --- Inst_SMEM__S_MEMREALTIME class methods --- + + Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_memrealtime") + { + } // Inst_SMEM__S_MEMREALTIME + + Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME() + { + } // ~Inst_SMEM__S_MEMREALTIME + + // --- description from .arch file --- + // Return current 64-bit RTC. + void + Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_ATC_PROBE class methods --- + + Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_atc_probe") + { + } // Inst_SMEM__S_ATC_PROBE + + Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE() + { + } // ~Inst_SMEM__S_ATC_PROBE + + // --- description from .arch file --- + // Probe or prefetch an address into the SQC data cache. + void + Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_ATC_PROBE_BUFFER class methods --- + + Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_atc_probe_buffer") + { + } // Inst_SMEM__S_ATC_PROBE_BUFFER + + Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER() + { + } // ~Inst_SMEM__S_ATC_PROBE_BUFFER + + // --- description from .arch file --- + // Probe or prefetch an address into the SQC data cache. + void + Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sop1.cc b/src/arch/amdgpu/vega/insts/sop1.cc new file mode 100644 index 0000000000..fa9a103e39 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sop1.cc @@ -0,0 +1,1504 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOP1__S_MOV_B32 class methods --- + + Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_B32 + + Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32() + { + } // ~Inst_SOP1__S_MOV_B32 + + // --- description from .arch file --- + // D.u = S0.u. + void + Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOV_B64 class methods --- + + Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_B64 + + Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64() + { + } // ~Inst_SOP1__S_MOV_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64. + void + Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_CMOV_B32 class methods --- + + Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_cmov_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_CMOV_B32 + + Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32() + { + } // ~Inst_SOP1__S_CMOV_B32 + + // --- description from .arch file --- + // (SCC) then D.u = S0.u; + // else NOP. + // Conditional move. 
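+    // SCC acts as the select: the destination is written only when the
+    // condition bit is set, e.g. to pick a value after an s_cmp_* compare
+    // without branching.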
+ void + Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + scc.read(); + + if (scc.rawData()) { + sdst = src.rawData(); + sdst.write(); + } + } // execute + // --- Inst_SOP1__S_CMOV_B64 class methods --- + + Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_cmov_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_CMOV_B64 + + Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64() + { + } // ~Inst_SOP1__S_CMOV_B64 + + // --- description from .arch file --- + // if (SCC) then D.u64 = S0.u64; + // else NOP. + // Conditional move. + void + Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + scc.read(); + + if (scc.rawData()) { + sdst = src.rawData(); + sdst.write(); + } + } // execute + // --- Inst_SOP1__S_NOT_B32 class methods --- + + Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_not_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_NOT_B32 + + Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32() + { + } // ~Inst_SOP1__S_NOT_B32 + + // --- description from .arch file --- + // D.u = ~S0.u; + // SCC = 1 if result is non-zero. + // Bitwise negation. + void + Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = ~src.rawData(); + + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_NOT_B64 class methods --- + + Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_not_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_NOT_B64 + + Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64() + { + } // ~Inst_SOP1__S_NOT_B64 + + // --- description from .arch file --- + // D.u64 = ~S0.u64; + // SCC = 1 if result is non-zero. + // Bitwise negation. + void + Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = ~src.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_WQM_B32 class methods --- + + Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_wqm_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_WQM_B32 + + Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32() + { + } // ~Inst_SOP1__S_WQM_B32 + + // --- description from .arch file --- + // D[i] = (S0[(i & ~3):(i | 3)] != 0); + // Computes whole quad mode for an active/valid mask. + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wholeQuadMode(src.rawData()); + scc = sdst.rawData() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_WQM_B64 class methods --- + + Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_wqm_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_WQM_B64 + + Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64() + { + } // ~Inst_SOP1__S_WQM_B64 + + // --- description from .arch file --- + // D[i] = (S0[(i & ~3):(i | 3)] != 0); + // Computes whole quad mode for an active/valid mask. + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wholeQuadMode(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BREV_B32 class methods --- + + Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_brev_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BREV_B32 + + Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32() + { + } // ~Inst_SOP1__S_BREV_B32 + + // --- description from .arch file --- + // D.u[31:0] = S0.u[0:31] (reverse bits). + void + Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = reverseBits(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BREV_B64 class methods --- + + Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_brev_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BREV_B64 + + Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64() + { + } // ~Inst_SOP1__S_BREV_B64 + + // --- description from .arch file --- + // D.u64[63:0] = S0.u64[0:63] (reverse bits). + void + Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = reverseBits(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BCNT0_I32_B32 class methods --- + + Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt0_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT0_I32_B32 + + Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32() + { + } // ~Inst_SOP1__S_BCNT0_I32_B32 + + // --- description from .arch file --- + // D.i = CountZeroBits(S0.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = countZeroBits(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BCNT0_I32_B64 class methods --- + + Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt0_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT0_I32_B64 + + Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64() + { + } // ~Inst_SOP1__S_BCNT0_I32_B64 + + // --- description from .arch file --- + // D.i = CountZeroBits(S0.u64); + // SCC = 1 if result is non-zero. 
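+    // Example: S0 = 0x0 gives D = 64 (all bits clear) and SCC = 1, while
+    // S0 = ~0x0 gives D = 0 and SCC = 0.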
+ void + Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = countZeroBits(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BCNT1_I32_B32 class methods --- + + Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt1_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT1_I32_B32 + + Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32() + { + } // ~Inst_SOP1__S_BCNT1_I32_B32 + + // --- description from .arch file --- + // D.i = CountOneBits(S0.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = popCount(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BCNT1_I32_B64 class methods --- + + Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt1_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT1_I32_B64 + + Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64() + { + } // ~Inst_SOP1__S_BCNT1_I32_B64 + + // --- description from .arch file --- + // D.i = CountOneBits(S0.u64); + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = popCount(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_FF0_I32_B32 class methods --- + + Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff0_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FF0_I32_B32 + + Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32() + { + } // ~Inst_SOP1__S_FF0_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstZero(S0.u); + // If no zeros are found, return -1. + // Returns the bit position of the first zero from the LSB. + void + Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstZero(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF0_I32_B64 class methods --- + + Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff0_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FF0_I32_B64 + + Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() + { + } // ~Inst_SOP1__S_FF0_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstZero(S0.u64); + // If no zeros are found, return -1. + // Returns the bit position of the first zero from the LSB. 
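+    // Example: S0 = 0x7 returns 3 (bit 3 is the lowest clear bit); an
+    // all-ones source returns -1.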
+ void + Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstZero(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF1_I32_B32 class methods --- + + Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff1_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FF1_I32_B32 + + Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() + { + } // ~Inst_SOP1__S_FF1_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u); + // If no ones are found, return -1. + // Returns the bit position of the first one from the LSB. + void + Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstOne(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF1_I32_B64 class methods --- + + Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff1_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FF1_I32_B64 + + Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() + { + } // ~Inst_SOP1__S_FF1_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u64); + // If no ones are found, return -1. + // Returns the bit position of the first one from the LSB. + void + Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstOne(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_B32 class methods --- + + Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_B32 + + Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() + { + } // ~Inst_SOP1__S_FLBIT_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u); + // If no ones are found, return -1. + // Counts how many zeros before the first one starting from the MSB. + void + Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = countZeroBitsMsb(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_B64 class methods --- + + Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_B64 + + Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() + { + } // ~Inst_SOP1__S_FLBIT_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u64); + // If no ones are found, return -1. + // Counts how many zeros before the first one starting from the MSB. 
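+    // Example: S0 = 0xFFFF returns 48 (bits 63..16 are zero before the
+    // first one is found scanning down from the MSB); S0 = 0 returns -1.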
+    void
+    Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst = countZeroBitsMsb(src.rawData());
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_FLBIT_I32 class methods ---
+
+    Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_flbit_i32")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_FLBIT_I32
+
+    Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32()
+    {
+    } // ~Inst_SOP1__S_FLBIT_I32
+
+    // --- description from .arch file ---
+    // D.i = FirstOppositeSignBit(S0.i);
+    // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1.
+    // Counts how many bits in a row (from MSB to LSB) are the same as the
+    // sign bit.
+    void
+    Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst = firstOppositeSignBit(src.rawData());
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_FLBIT_I32_I64 class methods ---
+
+    Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_flbit_i32_i64")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_FLBIT_I32_I64
+
+    Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64()
+    {
+    } // ~Inst_SOP1__S_FLBIT_I32_I64
+
+    // --- description from .arch file ---
+    // D.i = FirstOppositeSignBit(S0.i64);
+    // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1.
+    // Counts how many bits in a row (from MSB to LSB) are the same as the
+    // sign bit.
+    void
+    Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst = firstOppositeSignBit(src.rawData());
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_SEXT_I32_I8 class methods ---
+
+    Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_sext_i32_i8")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_SEXT_I32_I8
+
+    Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8()
+    {
+    } // ~Inst_SOP1__S_SEXT_I32_I8
+
+    // --- description from .arch file ---
+    // D.i = signext(S0.i[7:0]) (sign extension).
+    void
+    Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst = sext<std::numeric_limits<uint8_t>::digits>(
+            bits(src.rawData(), 7, 0));
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_SEXT_I32_I16 class methods ---
+
+    Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_sext_i32_i16")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_SEXT_I32_I16
+
+    Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16()
+    {
+    } // ~Inst_SOP1__S_SEXT_I32_I16
+
+    // --- description from .arch file ---
+    // D.i = signext(S0.i[15:0]) (sign extension).
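+    // Example: S0 = 0x00008000 yields D = 0xFFFF8000 (-32768); bits 31:16
+    // of the source are ignored.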
+    void
+    Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst = sext<std::numeric_limits<uint16_t>::digits>(
+            bits(src.rawData(), 15, 0));
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_BITSET0_B32 class methods ---
+
+    Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_bitset0_b32")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_BITSET0_B32
+
+    Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32()
+    {
+    } // ~Inst_SOP1__S_BITSET0_B32
+
+    // --- description from .arch file ---
+    // D.u[S0.u[4:0]] = 0.
+    void
+    Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst.setBit(bits(src.rawData(), 4, 0), 0);
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_BITSET0_B64 class methods ---
+
+    Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_bitset0_b64")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_BITSET0_B64
+
+    Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64()
+    {
+    } // ~Inst_SOP1__S_BITSET0_B64
+
+    // --- description from .arch file ---
+    // D.u64[S0.u[5:0]] = 0.
+    void
+    Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst.setBit(bits(src.rawData(), 5, 0), 0);
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_BITSET1_B32 class methods ---
+
+    Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_bitset1_b32")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_BITSET1_B32
+
+    Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32()
+    {
+    } // ~Inst_SOP1__S_BITSET1_B32
+
+    // --- description from .arch file ---
+    // D.u[S0.u[4:0]] = 1.
+    void
+    Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst.setBit(bits(src.rawData(), 4, 0), 1);
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_BITSET1_B64 class methods ---
+
+    Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_bitset1_b64")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_BITSET1_B64
+
+    Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64()
+    {
+    } // ~Inst_SOP1__S_BITSET1_B64
+
+    // --- description from .arch file ---
+    // D.u64[S0.u[5:0]] = 1.
+    void
+    Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
+        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
+
+        src.read();
+
+        sdst.setBit(bits(src.rawData(), 5, 0), 1);
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP1__S_GETPC_B64 class methods ---
+
+    Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt)
+        : Inst_SOP1(iFmt, "s_getpc_b64")
+    {
+        setFlag(ALU);
+    } // Inst_SOP1__S_GETPC_B64
+
+    Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64()
+    {
+    } // ~Inst_SOP1__S_GETPC_B64
+
+    // --- description from .arch file ---
+    // D.u64 = PC + 4.
+    // Destination receives the byte address of the next instruction.
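+    // The SOP1 encoding is 4 bytes, so PC + 4 is the byte address of the
+    // instruction that immediately follows s_getpc_b64.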
+ void + Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Addr pc = gpuDynInst->pc(); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + sdst = pc + 4; + + sdst.write(); + } // execute + // --- Inst_SOP1__S_SETPC_B64 class methods --- + + Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_setpc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_SETPC_B64 + + Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() + { + } // ~Inst_SOP1__S_SETPC_B64 + + // --- description from .arch file --- + // PC = S0.u64. + // S0.u64 is a byte address of the instruction to jump to. + void + Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + + src.read(); + + wf->pc(src.rawData()); + } // execute + // --- Inst_SOP1__S_SWAPPC_B64 class methods --- + + Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_swappc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_SWAPPC_B64 + + Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() + { + } // ~Inst_SOP1__S_SWAPPC_B64 + + // --- description from .arch file --- + // D.u64 = PC + 4; PC = S0.u64. + // S0.u64 is a byte address of the instruction to jump to. + void + Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = pc + 4; + + wf->pc(src.rawData()); + sdst.write(); + } // execute + // --- Inst_SOP1__S_RFE_B64 class methods --- + + Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_rfe_b64") + { + } // Inst_SOP1__S_RFE_B64 + + Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64() + { + } // ~Inst_SOP1__S_RFE_B64 + + // --- description from .arch file --- + // PRIV = 0; + // PC = S0.u64. + // Return from exception handler and continue. + // This instruction may only be used within a trap handler. + void + Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_AND_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_and_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_AND_SAVEEXEC_B64 + + Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_AND_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 & EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() & wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_OR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_or_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_OR_SAVEEXEC_B64 + + Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_OR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 | EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() | wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_XOR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_xor_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_XOR_SAVEEXEC_B64 + + Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_XOR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 ^ EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() ^ wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_ANDN2_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_andn2_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64 + + Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 & ~EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() &~ wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_ORN2_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_orn2_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_ORN2_SAVEEXEC_B64 + + Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 | ~EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() |~ wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_NAND_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_nand_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_NAND_SAVEEXEC_B64 + + Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_NAND_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = ~(S0.u64 & EXEC); + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong()); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_NOR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_nor_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_NOR_SAVEEXEC_B64 + + Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_NOR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = ~(S0.u64 | EXEC); + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong()); + scc = wf->execMask().any() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_XNOR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_xnor_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_XNOR_SAVEEXEC_B64 + + Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = ~(S0.u64 ^ EXEC); + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong()); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_QUADMASK_B32 class methods --- + + Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_quadmask_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_QUADMASK_B32 + + Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32() + { + } // ~Inst_SOP1__S_QUADMASK_B32 + + // --- description from .arch file --- + // D.u = QuadMask(S0.u): + // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0; + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = quadMask(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_QUADMASK_B64 class methods --- + + Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_quadmask_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_QUADMASK_B64 + + Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64() + { + } // ~Inst_SOP1__S_QUADMASK_B64 + + // --- description from .arch file --- + // D.u64 = QuadMask(S0.u64): + // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0; + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = quadMask(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_MOVRELS_B32 class methods --- + + Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movrels_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELS_B32 + + Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32() + { + } // ~Inst_SOP1__S_MOVRELS_B32 + + // --- description from .arch file --- + // D.u = SGPR[S0.u + M0.u].u (move from relative source). 
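+    // Example: with M0 = 3 and SSRC0 naming s4, the value read is s7; M0
+    // supplies the runtime index added to the encoded source register.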
+ void + Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELS_B64 class methods --- + + Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movrels_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELS_B64 + + Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() + { + } // ~Inst_SOP1__S_MOVRELS_B64 + + // --- description from .arch file --- + // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). + // The index in M0.u must be even for this operation. + void + Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELD_B32 class methods --- + + Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movreld_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELD_B32 + + Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() + { + } // ~Inst_SOP1__S_MOVRELD_B32 + + // --- description from .arch file --- + // SGPR[D.u + M0.u].u = S0.u (move to relative destination). + void + Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData()); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELD_B64 class methods --- + + Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movreld_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELD_B64 + + Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() + { + } // ~Inst_SOP1__S_MOVRELD_B64 + + // --- description from .arch file --- + // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination). + // The index in M0.u must be even for this operation. + void + Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData()); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_CBRANCH_JOIN class methods --- + + Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_cbranch_join") + { + setFlag(Branch); + setFlag(WritesEXEC); + } // Inst_SOP1__S_CBRANCH_JOIN + + Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() + { + } // ~Inst_SOP1__S_CBRANCH_JOIN + + // --- description from .arch file --- + // saved_csp = S0.u; + // if (CSP == saved_csp) then + // PC += 4; // Second time to JOIN: continue with program. + // else + // CSP -= 1; // First time to JOIN; jump to other FORK path. + // {PC, EXEC} = SGPR[CSP * 4]; // Read 128 bits from 4 consecutive + // SGPRs. + // end + // Conditional branch join point (end of conditional branch block). S0 is + // saved CSP value. + // See S_CBRANCH_G_FORK and S_CBRANCH_I_FORK for related instructions. 
+ void + Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_ABS_I32 class methods --- + + Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_abs_i32") + { + setFlag(ALU); + } // Inst_SOP1__S_ABS_I32 + + Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() + { + } // ~Inst_SOP1__S_ABS_I32 + + // --- description from .arch file --- + // if (S.i < 0) then D.i = -S.i; + // else D.i = S.i; + // SCC = 1 if result is non-zero. + // Integer absolute value. + void + Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = std::abs(src.rawData()); + + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_MOV_FED_B32 class methods --- + + Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_fed_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_FED_B32 + + Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() + { + } // ~Inst_SOP1__S_MOV_FED_B32 + + // --- description from .arch file --- + // D.u = S0.u. Introduce an EDC double-detect error on write to the + // destination SGPR. + void + Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_SET_GPR_IDX_IDX class methods --- + + Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") + { + } // Inst_SOP1__S_SET_GPR_IDX_IDX + + Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() + { + } // ~Inst_SOP1__S_SET_GPR_IDX_IDX + + // --- description from .arch file --- + // M0[7:0] = S0.u[7:0]. + // Modify the index used in vector GPR indexing. + void + Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sop2.cc b/src/arch/amdgpu/vega/insts/sop2.cc new file mode 100644 index 0000000000..93618b2124 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sop2.cc @@ -0,0 +1,1555 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOP2__S_ADD_U32 class methods --- + + Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_add_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADD_U32 + + Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() + { + } // ~Inst_SOP2__S_ADD_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + // SCC = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an unsigned + // --- overflow/carry-out for S_ADDC_U32. + void + Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() + src1.rawData(); + scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) + >= 0x100000000ULL ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUB_U32 class methods --- + + Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_sub_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUB_U32 + + Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() + { + } // ~Inst_SOP2__S_SUB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out for + // --- S_SUBB_U32. + void + Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() - src1.rawData(); + scc = (src1.rawData() > src0.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ADD_I32 class methods --- + + Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_add_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADD_I32 + + Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() + { + } // ~Inst_SOP2__S_ADD_I32 + + // --- description from .arch file --- + // D.i = S0.i + S1.i; + // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed + // overflow. + // This opcode is not suitable for use with S_ADDC_U32 for implementing + // 64-bit operations. + void + Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() + src1.rawData(); + scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) + && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) + ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUB_I32 class methods --- + + Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_sub_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUB_I32 + + Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() + { + } // ~Inst_SOP2__S_SUB_I32 + + // --- description from .arch file --- + // D.i = S0.i - S1.i; + // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed + // overflow. + // CAUTION: The condition code behaviour for this opcode is inconsistent + // with V_SUB_I32; see V_SUB_I32 for further details. + // This opcode is not suitable for use with S_SUBB_U32 for implementing + // 64-bit operations. + void + Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() - src1.rawData(); + scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) + && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ADDC_U32 class methods --- + + Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_addc_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADDC_U32 + + Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() + { + } // ~Inst_SOP2__S_ADDC_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + SCC; + // SCC = (S0.u + S1.u + SCC >= 0x800000000ULL ? 1 : 0) is an unsigned + // overflow. + void + Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = src0.rawData() + src1.rawData() + scc.rawData(); + scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() + + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUBB_U32 class methods --- + + Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_subb_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUBB_U32 + + Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() + { + } // ~Inst_SOP2__S_SUBB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u - SCC; + // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. + void + Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = src0.rawData() - src1.rawData() - scc.rawData(); + scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MIN_I32 class methods --- + + Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_min_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MIN_I32 + + Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() + { + } // ~Inst_SOP2__S_MIN_I32 + + // --- description from .arch file --- + // D.i = (S0.i < S1.i) ? 
S0.i : S1.i; + // SCC = 1 if S0 is chosen as the minimum value. + void + Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::min(src0.rawData(), src1.rawData()); + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MIN_U32 class methods --- + + Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_min_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MIN_U32 + + Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() + { + } // ~Inst_SOP2__S_MIN_U32 + + // --- description from .arch file --- + // D.u = (S0.u < S1.u) ? S0.u : S1.u; + // SCC = 1 if S0 is chosen as the minimum value. + void + Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::min(src0.rawData(), src1.rawData()); + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MAX_I32 class methods --- + + Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_max_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MAX_I32 + + Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() + { + } // ~Inst_SOP2__S_MAX_I32 + + // --- description from .arch file --- + // D.i = (S0.i > S1.i) ? S0.i : S1.i; + // SCC = 1 if S0 is chosen as the maximum value. + void + Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::max(src0.rawData(), src1.rawData()); + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MAX_U32 class methods --- + + Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_max_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MAX_U32 + + Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() + { + } // ~Inst_SOP2__S_MAX_U32 + + // --- description from .arch file --- + // D.u = (S0.u > S1.u) ? S0.u : S1.u; + // SCC = 1 if S0 is chosen as the maximum value. + void + Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::max(src0.rawData(), src1.rawData()); + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_CSELECT_B32 class methods --- + + Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_cselect_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_CSELECT_B32 + + Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() + { + } // ~Inst_SOP2__S_CSELECT_B32 + + // --- description from .arch file --- + // D.u = SCC ? 
S0.u : S1.u (conditional select). + void + Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = scc.rawData() ? src0.rawData() : src1.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_CSELECT_B64 class methods --- + + Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_cselect_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_CSELECT_B64 + + Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() + { + } // ~Inst_SOP2__S_CSELECT_B64 + + // --- description from .arch file --- + // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). + void + Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = scc.rawData() ? src0.rawData() : src1.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_AND_B32 class methods --- + + Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_and_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_AND_B32 + + Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() + { + } // ~Inst_SOP2__S_AND_B32 + + // --- description from .arch file --- + // D.u = S0.u & S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() & src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_AND_B64 class methods --- + + Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_and_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_AND_B64 + + Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() + { + } // ~Inst_SOP2__S_AND_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 & S1.u64; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() & src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_OR_B32 class methods --- + + Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_or_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_OR_B32 + + Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() + { + } // ~Inst_SOP2__S_OR_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() | src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_OR_B64 class methods --- + + Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_or_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_OR_B64 + + Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() + { + } // ~Inst_SOP2__S_OR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 | S1.u64; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() | src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XOR_B32 class methods --- + + Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_XOR_B32 + + Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() + { + } // ~Inst_SOP2__S_XOR_B32 + + // --- description from .arch file --- + // D.u = S0.u ^ S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() ^ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XOR_B64 class methods --- + + Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_XOR_B64 + + Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() + { + } // ~Inst_SOP2__S_XOR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 ^ S1.u64; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() ^ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ANDN2_B32 class methods --- + + Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_andn2_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_ANDN2_B32 + + Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() + { + } // ~Inst_SOP2__S_ANDN2_B32 + + // --- description from .arch file --- + // D.u = S0.u & ~S1.u; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() &~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ANDN2_B64 class methods --- + + Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_andn2_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_ANDN2_B64 + + Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() + { + } // ~Inst_SOP2__S_ANDN2_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 & ~S1.u64; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() &~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ORN2_B32 class methods --- + + Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_orn2_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_ORN2_B32 + + Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() + { + } // ~Inst_SOP2__S_ORN2_B32 + + // --- description from .arch file --- + // D.u = S0.u | ~S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() |~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ORN2_B64 class methods --- + + Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_orn2_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_ORN2_B64 + + Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() + { + } // ~Inst_SOP2__S_ORN2_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 | ~S1.u64; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() |~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NAND_B32 class methods --- + + Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nand_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_NAND_B32 + + Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() + { + } // ~Inst_SOP2__S_NAND_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u & S1.u); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() & src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NAND_B64 class methods --- + + Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nand_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_NAND_B64 + + Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() + { + } // ~Inst_SOP2__S_NAND_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 & S1.u64); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() & src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NOR_B32 class methods --- + + Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_NOR_B32 + + Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() + { + } // ~Inst_SOP2__S_NOR_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u | S1.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() | src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NOR_B64 class methods --- + + Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_NOR_B64 + + Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() + { + } // ~Inst_SOP2__S_NOR_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 | S1.u64); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() | src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XNOR_B32 class methods --- + + Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xnor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_XNOR_B32 + + Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() + { + } // ~Inst_SOP2__S_XNOR_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u ^ S1.u); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() ^ src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XNOR_B64 class methods --- + + Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xnor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_XNOR_B64 + + Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() + { + } // ~Inst_SOP2__S_XNOR_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 ^ S1.u64); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() ^ src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHL_B32 class methods --- + + Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshl_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHL_B32 + + Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() + { + } // ~Inst_SOP2__S_LSHL_B32 + + // --- description from .arch file --- + // D.u = S0.u << S1.u[4:0]; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() << bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHL_B64 class methods --- + + Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshl_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHL_B64 + + Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() + { + } // ~Inst_SOP2__S_LSHL_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 << S1.u[5:0]; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() << bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHR_B32 class methods --- + + Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshr_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHR_B32 + + Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() + { + } // ~Inst_SOP2__S_LSHR_B32 + + // --- description from .arch file --- + // D.u = S0.u >> S1.u[4:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to zero. 
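Because the scalar shifts use only the low bits of S1 and zero-fill from the left, the result is well defined even for shift amounts written larger than 31; a quick stand-alone check of that semantics (illustrative only, not gem5 code):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // s_lshr_b32 semantics: only S1[4:0] is used, vacated bits are 0.
        uint32_t s0 = 0xf0000000u;
        uint32_t shift = 36 & 0x1f;            // 36 wraps to 4
        assert((s0 >> shift) == 0x0f000000u);
        return 0;
    }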
+ void + Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHR_B64 class methods --- + + Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshr_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHR_B64 + + Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() + { + } // ~Inst_SOP2__S_LSHR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 >> S1.u[5:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to zero. + void + Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ASHR_I32 class methods --- + + Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_ashr_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_ASHR_I32 + + Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() + { + } // ~Inst_SOP2__S_ASHR_I32 + + // --- description from .arch file --- + // D.i = signext(S0.i) >> S1.u[4:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to the sign bit of the input value. + void + Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ASHR_I64 class methods --- + + Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_ashr_i64") + { + setFlag(ALU); + } // Inst_SOP2__S_ASHR_I64 + + Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() + { + } // ~Inst_SOP2__S_ASHR_I64 + + // --- description from .arch file --- + // D.i64 = signext(S0.i64) >> S1.u[5:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to the sign bit of the input value. + void + Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 
1 : 0;
+
+ sdst.write();
+ scc.write();
+ } // execute
+ // --- Inst_SOP2__S_BFM_B32 class methods ---
+
+ Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_bfm_b32")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_BFM_B32
+
+ Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32()
+ {
+ } // ~Inst_SOP2__S_BFM_B32
+
+ // --- description from .arch file ---
+ // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask).
+ void
+ Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+
+ src0.read();
+ src1.read();
+
+ sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1)
+ << bits(src1.rawData(), 4, 0);
+
+ sdst.write();
+ } // execute
+ // --- Inst_SOP2__S_BFM_B64 class methods ---
+
+ Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_bfm_b64")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_BFM_B64
+
+ Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64()
+ {
+ } // ~Inst_SOP2__S_BFM_B64
+
+ // --- description from .arch file ---
+ // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask).
+ void
+ Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
+
+ src0.read();
+ src1.read();
+
+ sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1)
+ << bits(src1.rawData(), 5, 0);
+
+ sdst.write();
+ } // execute
+ // --- Inst_SOP2__S_MUL_I32 class methods ---
+
+ Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_mul_i32")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_MUL_I32
+
+ Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32()
+ {
+ } // ~Inst_SOP2__S_MUL_I32
+
+ // --- description from .arch file ---
+ // D.i = S0.i * S1.i.
+ void
+ Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+ src0.read();
+ src1.read();
+
+ sdst = src0.rawData() * src1.rawData();
+
+ sdst.write();
+ } // execute
+ // --- Inst_SOP2__S_BFE_U32 class methods ---
+
+ Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_bfe_u32")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_BFE_U32
+
+ Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32()
+ {
+ } // ~Inst_SOP2__S_BFE_U32
+
+ // --- description from .arch file ---
+ // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is
+ // field width.
+ // D.u = (S0.u>>S1.u[4:0]) & ((1<<S1.u[22:16])-1);
+ // SCC = 1 if result is non-zero.
+ void
+ Inst_SOP2__S_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+ ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+ src0.read();
+ src1.read();
+
+ sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0))
+ & ((1 << bits(src1.rawData(), 22, 16)) - 1);
+ scc = sdst.rawData() ? 1 : 0;
+
+ sdst.write();
+ scc.write();
+ } // execute
+ // --- Inst_SOP2__S_BFE_I32 class methods ---
+
+ Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_bfe_i32")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_BFE_I32
+
+ Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32()
+ {
+ } // ~Inst_SOP2__S_BFE_I32
+
+ // --- description from .arch file ---
+ // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is
+ // field width.
+ // D.i = (S0.i>>S1.u[4:0]) & ((1<<S1.u[22:16])-1);
+ // SCC = 1 if result is non-zero.
+ void
+ Inst_SOP2__S_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+ ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+ src0.read();
+ src1.read();
+
+ sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0))
+ & ((1 << bits(src1.rawData(), 22, 16)) - 1);
+
+ // Above extracted a signed int of size src1[22:16] bits which needs
+ // to be sign-extended. Check if the MSB of our src1[22:16]-bit
+ // integer is 1, and sign extend if it is.
+ //
+ // Note: The description in the Vega ISA manual does not mention to
+ // sign-extend the result. An updated description can be found in the
+ // more recent RDNA3 manual here:
+ // https://developer.amd.com/wp-content/resources/
+ // RDNA3_Shader_ISA_December2022.pdf
+ if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) {
+ sdst = sdst.rawData()
+ | (0xffffffff << bits(src1.rawData(), 22, 16));
+ }
+
+ scc = sdst.rawData() ? 1 : 0;
+
+ sdst.write();
+ scc.write();
+ } // execute
+ // --- Inst_SOP2__S_BFE_U64 class methods ---
+
+ Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_bfe_u64")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_BFE_U64
+
+ Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64()
+ {
+ } // ~Inst_SOP2__S_BFE_U64
+
+ // --- description from .arch file ---
+ // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is
+ // field width.
+ // D.u64 = (S0.u64>>S1.u[5:0]) & ((1<<S1.u[22:16])-1);
+ // SCC = 1 if result is non-zero.
+ void
+ Inst_SOP2__S_BFE_U64::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
+ ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+ src0.read();
+ src1.read();
+
+ sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0))
+ & ((1 << bits(src1.rawData(), 22, 16)) - 1);
+ scc = sdst.rawData() ? 1 : 0;
+
+ sdst.write();
+ scc.write();
+ } // execute
+ // --- Inst_SOP2__S_BFE_I64 class methods ---
+
+ Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_bfe_i64")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_BFE_I64
+
+ Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64()
+ {
+ } // ~Inst_SOP2__S_BFE_I64
+
+ // --- description from .arch file ---
+ // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is
+ // field width.
+ // D.i64 = (S0.i64>>S1.u[5:0]) & ((1<<S1.u[22:16])-1);
+ // SCC = 1 if result is non-zero.
+ void
+ Inst_SOP2__S_BFE_I64::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandI64 sdst(gpuDynInst, instData.SDST);
+ ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+ src0.read();
+ src1.read();
+
+ sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0))
+ & ((1 << bits(src1.rawData(), 22, 16)) - 1);
+
+ // Above extracted a signed int of size src1[22:16] bits which needs
+ // to be sign-extended. Check if the MSB of our src1[22:16]-bit
+ // integer is 1, and sign extend if it is.
+ if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) {
+ sdst = sdst.rawData()
+ | 0xffffffffffffffff << bits(src1.rawData(), 22, 16);
+ }
+ scc = sdst.rawData() ? 1 : 0;
+
+ sdst.write();
+ scc.write();
+ } // execute
+ // --- Inst_SOP2__S_CBRANCH_G_FORK class methods ---
+
+ Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_cbranch_g_fork")
+ {
+ setFlag(Branch);
+ } // Inst_SOP2__S_CBRANCH_G_FORK
+
+ Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK()
+ {
+ } // ~Inst_SOP2__S_CBRANCH_G_FORK
+
+ // --- description from .arch file ---
+ // mask_pass = S0.u64 & EXEC;
+ // mask_fail = ~S0.u64 & EXEC;
+ // if (mask_pass == EXEC)
+ // PC = S1.u64;
+ // elsif (mask_fail == EXEC)
+ // PC += 4;
+ // elsif (bitcount(mask_fail) < bitcount(mask_pass))
+ // EXEC = mask_fail;
+ // SGPR[CSP*4] = { S1.u64, mask_pass };
+ // CSP++;
+ // PC += 4;
+ // else
+ // EXEC = mask_pass;
+ // SGPR[CSP*4] = { PC + 4, mask_fail };
+ // CSP++;
+ // PC = S1.u64;
+ // end.
+ // Conditional branch using branch-stack.
+ // S0 = compare mask(vcc or any sgpr) and
+ // S1 = 64-bit byte address of target instruction.
+ // See also S_CBRANCH_JOIN.
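The fork pseudocode above splits a divergent wavefront by running the side with fewer active lanes first and pushing the other side (its resume PC and mask) onto the branch stack; gem5 leaves the instruction unimplemented below. A stand-alone sketch of just the mask bookkeeping (illustrative only; the names and types are assumptions, not gem5 API):

    #include <bitset>
    #include <cstdint>

    // Given the compare mask S0 and the current EXEC mask, compute both
    // divergence masks and report which one the spec says to run first.
    struct ForkMasks { uint64_t pass; uint64_t fail; bool runFailFirst; };

    inline ForkMasks cbranchGFork(uint64_t s0, uint64_t exec)
    {
        ForkMasks m;
        m.pass = s0 & exec;    // lanes that take the branch to S1
        m.fail = ~s0 & exec;   // lanes that fall through to PC + 4
        // Run the smaller side first so the larger side resumes at the join.
        m.runFailFirst = std::bitset<64>(m.fail).count()
            < std::bitset<64>(m.pass).count();
        return m;
    }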
+ void
+ Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_SOP2__S_ABSDIFF_I32 class methods ---
+
+ Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_absdiff_i32")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_ABSDIFF_I32
+
+ Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32()
+ {
+ } // ~Inst_SOP2__S_ABSDIFF_I32
+
+ // --- description from .arch file ---
+ // D.i = S0.i - S1.i;
+ // if (D.i < 0) then D.i = -D.i;
+ // SCC = 1 if result is non-zero.
+ // Compute the absolute value of difference between two values.
+ void
+ Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+ ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+ src0.read();
+ src1.read();
+
+ sdst = std::abs(src0.rawData() - src1.rawData());
+ scc = sdst.rawData() ? 1 : 0;
+
+ sdst.write();
+ scc.write();
+ } // execute
+ // --- Inst_SOP2__S_RFE_RESTORE_B64 class methods ---
+
+ Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64(
+ InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_rfe_restore_b64")
+ {
+ } // Inst_SOP2__S_RFE_RESTORE_B64
+
+ Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64()
+ {
+ } // ~Inst_SOP2__S_RFE_RESTORE_B64
+
+ // --- description from .arch file ---
+ // PRIV = 0;
+ // PC = S0.u64;
+ // INST_ATC = S1.u32[0].
+ // Return from exception handler and continue, possibly changing the
+ // --- instruction ATC mode.
+ // This instruction may only be used within a trap handler.
+ // Use this instruction when the main program may be in a different memory
+ // --- space than the trap handler.
+ void
+ Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_SOP2__S_MUL_HI_U32 class methods ---
+
+ Inst_SOP2__S_MUL_HI_U32::Inst_SOP2__S_MUL_HI_U32(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_mul_hi_u32")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_MUL_HI_U32
+
+ Inst_SOP2__S_MUL_HI_U32::~Inst_SOP2__S_MUL_HI_U32()
+ {
+ } // ~Inst_SOP2__S_MUL_HI_U32
+
+ // --- description from .arch file ---
+ // D.u = (S0.u * S1.u) >> 32;
+ void
+ Inst_SOP2__S_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+
+ src0.read();
+ src1.read();
+
+ VecElemU64 tmp_dst =
+ ((VecElemU64)src0.rawData() * (VecElemU64)src1.rawData());
+ sdst = (tmp_dst >> 32);
+
+ sdst.write();
+ } // execute
+ // --- Inst_SOP2__S_MUL_HI_I32 class methods ---
+
+ Inst_SOP2__S_MUL_HI_I32::Inst_SOP2__S_MUL_HI_I32(InFmt_SOP2 *iFmt)
+ : Inst_SOP2(iFmt, "s_mul_hi_i32")
+ {
+ setFlag(ALU);
+ } // Inst_SOP2__S_MUL_HI_I32
+
+ Inst_SOP2__S_MUL_HI_I32::~Inst_SOP2__S_MUL_HI_I32()
+ {
+ } // ~Inst_SOP2__S_MUL_HI_I32
+
+ // --- description from .arch file ---
+ // D.u = (S0.u * S1.u) >> 32;
+ void
+ Inst_SOP2__S_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
+ ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
+ ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+ src0.read();
+ src1.read();
+
+ VecElemI64 tmp_src0 =
+ sext<std::numeric_limits<VecElemI32>::digits>(src0.rawData());
+ VecElemI64 tmp_src1 =
+ sext<std::numeric_limits<VecElemI32>::digits>(src1.rawData());
+ sdst = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
+
+ sdst.write();
+ } // execute
+} // namespace VegaISA
+} //
namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopc.cc b/src/arch/amdgpu/vega/insts/sopc.cc new file mode 100644 index 0000000000..9c58688e53 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopc.cc @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPC__S_CMP_EQ_I32 class methods --- + + Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_I32 + + Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() + { + } // ~Inst_SOPC__S_CMP_EQ_I32 + + // --- description from .arch file --- + // SCC = (S0.i == S1.i). + void + Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_I32 class methods --- + + Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_I32 + + Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() + { + } // ~Inst_SOPC__S_CMP_LG_I32 + + // --- description from .arch file --- + // SCC = (S0.i != S1.i). + void + Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GT_I32 class methods --- + + Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_gt_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GT_I32 + + Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() + { + } // ~Inst_SOPC__S_CMP_GT_I32 + + // --- description from .arch file --- + // SCC = (S0.i > S1.i). + void + Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GE_I32 class methods --- + + Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_ge_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GE_I32 + + Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() + { + } // ~Inst_SOPC__S_CMP_GE_I32 + + // --- description from .arch file --- + // SCC = (S0.i >= S1.i). + void + Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LT_I32 class methods --- + + Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lt_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LT_I32 + + Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() + { + } // ~Inst_SOPC__S_CMP_LT_I32 + + // --- description from .arch file --- + // SCC = (S0.i < S1.i). + void + Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LE_I32 class methods --- + + Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_le_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LE_I32 + + Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() + { + } // ~Inst_SOPC__S_CMP_LE_I32 + + // --- description from .arch file --- + // SCC = (S0.i <= S1.i). + void + Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_EQ_U32 class methods --- + + Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_U32 + + Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() + { + } // ~Inst_SOPC__S_CMP_EQ_U32 + + // --- description from .arch file --- + // SCC = (S0.u == S1.u). 
+ void + Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_U32 class methods --- + + Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_U32 + + Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() + { + } // ~Inst_SOPC__S_CMP_LG_U32 + + // --- description from .arch file --- + // SCC = (S0.u != S1.u). + void + Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GT_U32 class methods --- + + Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_gt_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GT_U32 + + Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() + { + } // ~Inst_SOPC__S_CMP_GT_U32 + + // --- description from .arch file --- + // SCC = (S0.u > S1.u). + void + Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GE_U32 class methods --- + + Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_ge_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GE_U32 + + Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() + { + } // ~Inst_SOPC__S_CMP_GE_U32 + + // --- description from .arch file --- + // SCC = (S0.u >= S1.u). + void + Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LT_U32 class methods --- + + Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lt_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LT_U32 + + Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() + { + } // ~Inst_SOPC__S_CMP_LT_U32 + + // --- description from .arch file --- + // SCC = (S0.u < S1.u). + void + Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LE_U32 class methods --- + + Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_le_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LE_U32 + + Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() + { + } // ~Inst_SOPC__S_CMP_LE_U32 + + // --- description from .arch file --- + // SCC = (S0.u <= S1.u). + void + Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP0_B32 class methods --- + + Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp0_b32") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP0_B32 + + Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() + { + } // ~Inst_SOPC__S_BITCMP0_B32 + + // --- description from .arch file --- + // SCC = (S0.u[S1.u[4:0]] == 0). + void + Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP1_B32 class methods --- + + Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp1_b32") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP1_B32 + + Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32() + { + } // ~Inst_SOPC__S_BITCMP1_B32 + + // --- description from .arch file --- + // SCC = (S0.u[S1.u[4:0]] == 1). + void + Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP0_B64 class methods --- + + Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp0_b64") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP0_B64 + + Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64() + { + } // ~Inst_SOPC__S_BITCMP0_B64 + + // --- description from .arch file --- + // SCC = (S0.u64[S1.u[5:0]] == 0). + void + Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP1_B64 class methods --- + + Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp1_b64") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP1_B64 + + Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64() + { + } // ~Inst_SOPC__S_BITCMP1_B64 + + // --- description from .arch file --- + // SCC = (S0.u64[S1.u[5:0]] == 1). 
+ void + Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_SETVSKIP class methods --- + + Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_setvskip") + { + } // Inst_SOPC__S_SETVSKIP + + Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP() + { + } // ~Inst_SOPC__S_SETVSKIP + + // --- description from .arch file --- + // VSKIP = S0.u[S1.u[4:0]]. + // Enables and disables VSKIP mode. + // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instuctions are + // issued. + // If any vector operations are outstanding, S_WAITCNT must be issued + // before executing. + // This instruction requires one waitstate after executing (e.g. S_NOP 0). + // Example: + // s_waitcnt 0 + // s_setvskip 1, 0 // Enable vskip mode. + // s_nop 1 + void + Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods --- + + Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_set_gpr_idx_on") + { + } // Inst_SOPC__S_SET_GPR_IDX_ON + + Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON() + { + } // ~Inst_SOPC__S_SET_GPR_IDX_ON + + // --- description from .arch file --- + // MODE.gpr_idx_en = 1; + // M0[7:0] = S0.u[7:0]; + // M0[15:12] = SIMM4 (direct contents of S1 field); + // // Remaining bits of M0 are unmodified. + // Enable GPR indexing mode. Vector operations after this will perform + // relative GPR addressing based on the contents of M0. The structure + // SQ_M0_GPR_IDX_WORD may be used to decode M0. + // The raw contents of the S1 field are read and used to set the enable + // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and + // S1[3] = VDST_REL. + void + Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPC__S_CMP_EQ_U64 class methods --- + + Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_u64") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_U64 + + Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64() + { + } // ~Inst_SOPC__S_CMP_EQ_U64 + + // --- description from .arch file --- + // SCC = (S0.i64 == S1.i64). + void + Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_U64 class methods --- + + Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_u64") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_U64 + + Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64() + { + } // ~Inst_SOPC__S_CMP_LG_U64 + + // --- description from .arch file --- + // SCC = (S0.i64 != S1.i64). 
+ void + Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopk.cc b/src/arch/amdgpu/vega/insts/sopk.cc new file mode 100644 index 0000000000..7abbb9abb4 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopk.cc @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "dev/amdgpu/hwreg_defines.hh" +#include "gpu-compute/shader.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPK__S_MOVK_I32 class methods --- + + Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_movk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_MOVK_I32 + + Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() + { + } // ~Inst_SOPK__S_MOVK_I32 + + // --- description from .arch file --- + // D.i = signext(SIMM16) (sign extension). + void + Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + sdst = simm16; + + sdst.write(); + } // execute + // --- Inst_SOPK__S_CMOVK_I32 class methods --- + + Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmovk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMOVK_I32 + + Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() + { + } // ~Inst_SOPK__S_CMOVK_I32 + + // --- description from .arch file --- + // if (SCC) then D.i = signext(SIMM16); + // else NOP. + // Conditional move with sign extension. 
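Every SOPK instruction in this file folds its 16-bit literal into a 32-bit operand by sign extension, which is what the sext<16> calls in the execute() bodies below compute. A standalone sketch of that step and of the s_cmovk_i32 semantics, assuming nothing beyond standard C++:

    #include <cstdint>

    // Equivalent of sext<16>(simm16): replicate bit 15 into the upper bits.
    static int32_t signext16(uint16_t simm16)
    {
        return static_cast<int32_t>(static_cast<int16_t>(simm16));
    }

    // s_cmovk_i32: write signext(SIMM16) to the destination only if SCC is set.
    static void cmovk_i32(uint32_t scc, int32_t &sdst, uint16_t simm16)
    {
        if (scc) {
            sdst = signext16(simm16);
        }
    }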
+ void + Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + scc.read(); + + if (scc.rawData()) { + sdst = simm16; + sdst.write(); + } + } // execute + // --- Inst_SOPK__S_CMPK_EQ_I32 class methods --- + + Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_eq_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_EQ_I32 + + Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() + { + } // ~Inst_SOPK__S_CMPK_EQ_I32 + + // --- description from .arch file --- + // SCC = (S0.i == signext(SIMM16)). + void + Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() == simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LG_I32 class methods --- + + Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lg_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LG_I32 + + Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() + { + } // ~Inst_SOPK__S_CMPK_LG_I32 + + // --- description from .arch file --- + // SCC = (S0.i != signext(SIMM16)). + void + Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() != simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GT_I32 class methods --- + + Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_gt_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GT_I32 + + Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() + { + } // ~Inst_SOPK__S_CMPK_GT_I32 + + // --- description from .arch file --- + // SCC = (S0.i > signext(SIMM16)). + void + Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() > simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GE_I32 class methods --- + + Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_ge_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GE_I32 + + Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() + { + } // ~Inst_SOPK__S_CMPK_GE_I32 + + // --- description from .arch file --- + // SCC = (S0.i >= signext(SIMM16)). + void + Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() >= simm16) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LT_I32 class methods --- + + Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lt_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LT_I32 + + Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() + { + } // ~Inst_SOPK__S_CMPK_LT_I32 + + // --- description from .arch file --- + // SCC = (S0.i < signext(SIMM16)). + void + Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() < simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LE_I32 class methods --- + + Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_le_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LE_I32 + + Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32() + { + } // ~Inst_SOPK__S_CMPK_LE_I32 + + // --- description from .arch file --- + // SCC = (S0.i <= signext(SIMM16)). + void + Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() <= simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_EQ_U32 class methods --- + + Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_eq_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_EQ_U32 + + Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32() + { + } // ~Inst_SOPK__S_CMPK_EQ_U32 + + // --- description from .arch file --- + // SCC = (S0.u == SIMM16). + void + Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() == simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LG_U32 class methods --- + + Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lg_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LG_U32 + + Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32() + { + } // ~Inst_SOPK__S_CMPK_LG_U32 + + // --- description from .arch file --- + // SCC = (S0.u != SIMM16). + void + Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() != simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GT_U32 class methods --- + + Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_gt_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GT_U32 + + Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32() + { + } // ~Inst_SOPK__S_CMPK_GT_U32 + + // --- description from .arch file --- + // SCC = (S0.u > SIMM16). + void + Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() > simm16) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GE_U32 class methods --- + + Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_ge_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GE_U32 + + Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32() + { + } // ~Inst_SOPK__S_CMPK_GE_U32 + + // --- description from .arch file --- + // SCC = (S0.u >= SIMM16). + void + Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() >= simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LT_U32 class methods --- + + Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lt_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LT_U32 + + Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32() + { + } // ~Inst_SOPK__S_CMPK_LT_U32 + + // --- description from .arch file --- + // SCC = (S0.u < SIMM16). + void + Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() < simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LE_U32 class methods --- + + Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_le_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LE_U32 + + Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32() + { + } // ~Inst_SOPK__S_CMPK_LE_U32 + + // --- description from .arch file --- + // SCC = (S0.u <= SIMM16). + void + Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() <= simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_ADDK_I32 class methods --- + + Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_addk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_ADDK_I32 + + Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32() + { + } // ~Inst_SOPK__S_ADDK_I32 + + // --- description from .arch file --- + // D.i = D.i + signext(SIMM16); + // SCC = overflow. + void + Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16); + scc = (bits(src.rawData(), 31) == bits(simm16, 15) + && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOPK__S_MULK_I32 class methods --- + + Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_mulk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_MULK_I32 + + Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32() + { + } // ~Inst_SOPK__S_MULK_I32 + + // --- description from .arch file --- + // D.i = D.i * signext(SIMM16). 
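The SCC update in s_addk_i32 above encodes the usual two's-complement overflow rule: overflow can only occur when both addends share a sign and the sum's sign differs from it. A plain-C++ sketch of that test (the unsigned addition avoids signed-overflow undefined behavior):

    #include <cstdint>

    // Returns SCC (1 on signed overflow) and writes the wrapped sum.
    static uint32_t addk_i32(int32_t src, int16_t simm16, int32_t &sum)
    {
        int32_t imm = simm16;   // sign-extended by the integer conversion
        sum = static_cast<int32_t>(
            static_cast<uint32_t>(src) + static_cast<uint32_t>(imm));

        bool sameSign    = ((src ^ imm) & 0x80000000u) == 0;
        bool signFlipped = ((src ^ sum) & 0x80000000u) != 0;
        return (sameSign && signFlipped) ? 1 : 0;
    }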
+ void + Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16); + + sdst.write(); + } // execute + // --- Inst_SOPK__S_CBRANCH_I_FORK class methods --- + + Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cbranch_i_fork") + { + setFlag(Branch); + } // Inst_SOPK__S_CBRANCH_I_FORK + + Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK() + { + } // ~Inst_SOPK__S_CBRANCH_I_FORK + + // --- description from .arch file --- + // mask_pass = S0.u64 & EXEC; + // mask_fail = ~S0.u64 & EXEC; + // target_addr = PC + signext(SIMM16 * 4) + 4; + // if (mask_pass == EXEC) + // PC = target_addr; + // elsif (mask_fail == EXEC) + // PC += 4; + // elsif (bitcount(mask_fail) < bitcount(mask_pass)) + // EXEC = mask_fail; + // SGPR[CSP*4] = { target_addr, mask_pass }; + // CSP++; + // PC += 4; + // else + // EXEC = mask_pass; + // SGPR[CSP*4] = { PC + 4, mask_fail }; + // CSP++; + // PC = target_addr; + // end. + // Conditional branch using branch-stack. + // S0 = compare mask(vcc or any sgpr), and + // SIMM16 = signed DWORD branch offset relative to next instruction. + // See also S_CBRANCH_JOIN. + void + Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPK__S_GETREG_B32 class methods --- + + Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_getreg_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_GETREG_B32 + + Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32() + { + } // ~Inst_SOPK__S_GETREG_B32 + + // --- description from .arch file --- + // D.u = hardware-reg. Read some or all of a hardware register into the + // LSBs of D. + // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + sdst.read(); + + // Store value from hardware to part of the SDST. + ScalarRegU32 mask = (((1U << size) - 1U) << offset); + sdst = (hwreg & mask) >> offset; + sdst.write(); + } // execute + // --- Inst_SOPK__S_SETREG_B32 class methods --- + + Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_setreg_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_SETREG_B32 + + Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() + { + } // ~Inst_SOPK__S_SETREG_B32 + + // --- description from .arch file --- + // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware + // register. + // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. 
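s_getreg_b32 and s_setreg_b32 share the SIMM16 layout quoted above: hwRegId in bits [5:0], a bit offset in [10:6], and (size - 1) in [15:11]. A self-contained sketch of the decode and of the masked field insertion s_setreg_b32 performs, using hypothetical helper names rather than the gem5 API:

    #include <cstdint>

    struct HwRegField
    {
        uint32_t hwregId;   // SIMM16[5:0]
        uint32_t offset;    // SIMM16[10:6], 0..31
        uint32_t size;      // SIMM16[15:11] + 1, 1..32
    };

    static HwRegField decodeHwRegSimm16(uint16_t simm16)
    {
        return { simm16 & 0x3fu, (simm16 >> 6) & 0x1fu,
                 ((simm16 >> 11) & 0x1fu) + 1u };
    }

    // Insert the low 'size' bits of 'value' into 'hwreg' at 'offset'.
    static uint32_t insertHwRegField(uint32_t hwreg, uint32_t value,
                                     HwRegField f)
    {
        uint32_t mask = (f.size >= 32) ? 0xffffffffu
                                       : (((1u << f.size) - 1u) << f.offset);
        return (hwreg & ~mask) | ((value << f.offset) & mask);
    }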
+ void + Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + sdst.read(); + + // Store value from SDST to part of the hardware register. + ScalarRegU32 mask = (((1U << size) - 1U) << offset); + hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask)); + gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); + + // set MODE register to control the behavior of single precision + // floating-point numbers: denormal mode or round mode + if (hwregId==1 && size==2 + && (offset==4 || offset==0)) { + warn_once("Be cautious that s_setreg_b32 has no real effect " + "on FP modes: %s\n", gpuDynInst->disassemble()); + return; + } + + // panic if not changing MODE of floating-point numbers + panicUnimplemented(); + } // execute + // --- Inst_SOPK__S_SETREG_IMM32_B32 class methods --- + + Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32( + InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_setreg_imm32_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_SETREG_IMM32_B32 + + Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32() + { + } // ~Inst_SOPK__S_SETREG_IMM32_B32 + + // --- description from .arch file --- + // Write some or all of the LSBs of IMM32 into a hardware register; this + // --- instruction requires a 32-bit literal constant. + // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarRegI32 simm32 = extData.imm_u32; + + // Store value from SIMM32 to part of the hardware register. + ScalarRegU32 mask = (((1U << size) - 1U) << offset); + hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask)); + gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); + + // set MODE register to control the behavior of single precision + // floating-point numbers: denormal mode or round mode + if (hwregId==HW_REG_MODE && size==2 + && (offset==4 || offset==0)) { + warn_once("Be cautious that s_setreg_imm32_b32 has no real effect " + "on FP modes: %s\n", gpuDynInst->disassemble()); + return; + } + + // panic if not changing modes of single-precision FPs + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopp.cc b/src/arch/amdgpu/vega/insts/sopp.cc new file mode 100644 index 0000000000..df5cdbf681 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopp.cc @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "debug/GPUSync.hh" +#include "gpu-compute/shader.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPP__S_NOP class methods --- + + Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_nop") + { + setFlag(Nop); + } // Inst_SOPP__S_NOP + + Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP() + { + } // ~Inst_SOPP__S_NOP + + // --- description from .arch file --- + // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time, + // 7 = 8 times. + // This instruction may be used to introduce wait states to resolve + // hazards; see the shader programming guide for details. Compare with + // S_SLEEP. + void + Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_SOPP__S_ENDPGM class methods --- + + Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_endpgm") + { + setFlag(EndOfKernel); + } // Inst_SOPP__S_ENDPGM + + Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM() + { + } // ~Inst_SOPP__S_ENDPGM + + // --- description from .arch file --- + // End of program; terminate wavefront. + // The hardware implicitly executes S_WAITCNT 0 before executing this + // --- instruction. + // See S_ENDPGM_SAVED for the context-switch version of this instruction. + void + Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ComputeUnit *cu = gpuDynInst->computeUnit(); + + // delete extra instructions fetched for completed work-items + wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1, + wf->instructionBuffer.end()); + + if (wf->pendingFetch) { + wf->dropFetch = true; + } + + wf->computeUnit->fetchStage.fetchUnit(wf->simdId) + .flushBuf(wf->wfSlotId); + wf->setStatus(Wavefront::S_STOPPED); + + int refCount = wf->computeUnit->getLds() + .decreaseRefCounter(wf->dispatchId, wf->wgId); + + /** + * The parent WF of this instruction is exiting, therefore + * it should not participate in this barrier any longer. This + * prevents possible deadlock issues if WFs exit early. 
+ */ + int bar_id = WFBarrier::InvalidID; + if (wf->hasBarrier()) { + assert(wf->getStatus() != Wavefront::S_BARRIER); + bar_id = wf->barrierId(); + assert(bar_id != WFBarrier::InvalidID); + wf->releaseBarrier(); + cu->decMaxBarrierCnt(bar_id); + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the " + "program and decrementing max barrier count for " + "barrier Id%d. New max count: %d.\n", cu->cu_id, + wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, + cu->maxBarrierCnt(bar_id)); + } + + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", + wf->computeUnit->cu_id, wf->wgId, refCount); + + wf->computeUnit->registerManager->freeRegisters(wf); + wf->computeUnit->stats.completedWfs++; + wf->computeUnit->activeWaves--; + + panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less " + "than zero\n", wf->computeUnit->cu_id); + + DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", + wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId); + + for (int i = 0; i < wf->vecReads.size(); i++) { + if (wf->rawDist.find(i) != wf->rawDist.end()) { + wf->stats.readsPerWrite.sample(wf->vecReads.at(i)); + } + } + wf->vecReads.clear(); + wf->rawDist.clear(); + wf->lastInstExec = 0; + + if (!refCount) { + /** + * If all WFs have finished, and hence the WG has finished, + * then we can free up the barrier belonging to the parent + * WG, but only if we actually used a barrier (i.e., more + * than one WF in the WG). + */ + if (bar_id != WFBarrier::InvalidID) { + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " + "now complete. Releasing barrier Id%d.\n", cu->cu_id, + wf->simdId, wf->wfSlotId, wf->wfDynId, + wf->barrierId()); + cu->releaseBarrier(bar_id); + } + + /** + * Last wavefront of the workgroup has executed return. If the + * workgroup is not the final one in the kernel, then simply + * retire it; however, if it is the final one, i.e., indicating + * the kernel end, then release operation (i.e., GL2 WB) is + * needed + */ + + //check whether the workgroup is indicating the kernel end, i.e., + //the last workgroup in the kernel + bool kernelEnd = + wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf); + + bool relNeeded = + wf->computeUnit->shader->impl_kern_end_rel; + + //if it is not a kernel end, then retire the workgroup directly + if (!kernelEnd || !relNeeded) { + wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); + wf->setStatus(Wavefront::S_STOPPED); + wf->computeUnit->stats.completedWGs++; + + return; + } + + /** + * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and + * retire the workgroup after receving response. + * note that GL0V and GL1 are read only, and they just forward GL2 + * WB request. 
When forwarding, GL1 send the request to all GL2 in + * the complex + */ + setFlag(MemSync); + setFlag(GlobalSegment); + // Notify Memory System of Kernel Completion + // Kernel End = isKernel + isMemSync + wf->setStatus(Wavefront::S_RETURNING); + gpuDynInst->simdId = wf->simdId; + gpuDynInst->wfSlotId = wf->wfSlotId; + gpuDynInst->wfDynId = wf->wfDynId; + + DPRINTF(GPUExec, "inject global memory fence for CU%d: " + "WF[%d][%d][%d]\n", wf->computeUnit->cu_id, + wf->simdId, wf->wfSlotId, wf->wfDynId); + + // call shader to prepare the flush operations + wf->computeUnit->shader->prepareFlush(gpuDynInst); + + wf->computeUnit->stats.completedWGs++; + } else { + wf->computeUnit->shader->dispatcher().scheduleDispatch(); + } + } // execute + + // --- Inst_SOPP__S_BRANCH class methods --- + + Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_branch") + { + setFlag(Branch); + } // Inst_SOPP__S_BRANCH + + Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH() + { + } // ~Inst_SOPP__S_BRANCH + + // --- description from .arch file --- + // PC = PC + signext(SIMM16 * 4) + 4 (short jump). + // For a long jump, use S_SETPC. + void + Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_WAKEUP class methods --- + + Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_wakeup") + { + } // Inst_SOPP__S_WAKEUP + + Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP() + { + } // ~Inst_SOPP__S_WAKEUP + + // --- description from .arch file --- + // Allow a wave to 'ping' all the other waves in its threadgroup to force + // them to wake up immediately from an S_SLEEP instruction. The ping is + // ignored if the waves are not sleeping. + // This allows for more efficient polling on a memory location. The waves + // which are polling can sit in a long S_SLEEP between memory reads, but + // the wave which writes the value can tell them all to wake up early now + // that the data is available. This is useful for fBarrier implementations + // (speedup). + // This method is also safe from races because if any wave misses the ping, + // everything still works fine (whoever missed it just completes their + // normal S_SLEEP). + void + Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_SCC0 class methods --- + + Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_scc0") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_SCC0 + + Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0() + { + } // ~Inst_SOPP__S_CBRANCH_SCC0 + + // --- description from .arch file --- + // if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. 
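All of the SOPP conditional branches below compute the same target: the 16-bit immediate is a signed count of dwords relative to the instruction after the branch. A minimal sketch of that arithmetic:

    #include <cstdint>

    // PC = PC + signext(SIMM16) * 4 + 4 when the branch condition holds.
    static uint64_t branchTarget(uint64_t pc, int16_t simm16)
    {
        return pc + static_cast<int64_t>(simm16) * 4 + 4;
    }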
+ void + Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + scc.read(); + + if (!scc.rawData()) { + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + } + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_CBRANCH_SCC1 class methods --- + + Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_scc1") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_SCC1 + + Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1() + { + } // ~Inst_SOPP__S_CBRANCH_SCC1 + + // --- description from .arch file --- + // if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + scc.read(); + + if (scc.rawData()) { + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + } + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_CBRANCH_VCCZ class methods --- + + Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_vccz") + { + setFlag(Branch); + setFlag(ReadsVCC); + } // Inst_SOPP__S_CBRANCH_VCCZ + + Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ() + { + } // ~Inst_SOPP__S_CBRANCH_VCCZ + + // --- description from .arch file --- + // if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + + vcc.read(); + + if (!vcc.rawData()) { + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + } + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods --- + + Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_vccnz") + { + setFlag(Branch); + setFlag(ReadsVCC); + } // Inst_SOPP__S_CBRANCH_VCCNZ + + Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ() + { + } // ~Inst_SOPP__S_CBRANCH_VCCNZ + + // --- description from .arch file --- + // if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + vcc.read(); + + if (vcc.rawData()) { + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + wf->pc(pc); + } + } // execute + // --- Inst_SOPP__S_CBRANCH_EXECZ class methods --- + + Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_execz") + { + setFlag(Branch); + setFlag(ReadsEXEC); + } // Inst_SOPP__S_CBRANCH_EXECZ + + Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ() + { + } // ~Inst_SOPP__S_CBRANCH_EXECZ + + // --- description from .arch file --- + // if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. 
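s_cbranch_execz and s_cbranch_execnz differ from the SCC/VCC branches only in the condition they sample: whether any lane of the 64-bit EXEC mask is set. A sketch using std::bitset as a stand-in for the wavefront's execution mask:

    #include <bitset>

    static bool execz(const std::bitset<64> &exec)  { return exec.none(); }
    static bool execnz(const std::bitset<64> &exec) { return exec.any(); }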
+ void + Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (wf->execMask().none()) { + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + wf->pc(pc); + } + } // execute + // --- Inst_SOPP__S_CBRANCH_EXECNZ class methods --- + + Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_execnz") + { + setFlag(Branch); + setFlag(ReadsEXEC); + } // Inst_SOPP__S_CBRANCH_EXECNZ + + Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ() + { + } // ~Inst_SOPP__S_CBRANCH_EXECNZ + + // --- description from .arch file --- + // if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (wf->execMask().any()) { + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + wf->pc(pc); + } + } // execute + // --- Inst_SOPP__S_BARRIER class methods --- + + Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_barrier") + { + setFlag(MemBarrier); + } // Inst_SOPP__S_BARRIER + + Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER() + { + } // ~Inst_SOPP__S_BARRIER + + // --- description from .arch file --- + // Synchronize waves within a threadgroup. + // If not all waves of the threadgroup have been created yet, waits for + // entire group before proceeding. + // If some waves in the threadgroup have already terminated, this waits on + // only the surviving waves. + // Barriers are legal inside trap handlers. + void + Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ComputeUnit *cu = gpuDynInst->computeUnit(); + + if (wf->hasBarrier()) { + int bar_id = wf->barrierId(); + assert(wf->getStatus() == Wavefront::S_BARRIER); + cu->incNumAtBarrier(bar_id); + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at " + "barrier Id%d. %d waves now at barrier, %d waves " + "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId, + wf->wfDynId, bar_id, cu->numAtBarrier(bar_id), + cu->numYetToReachBarrier(bar_id)); + } + } // execute + // --- Inst_SOPP__S_SETKILL class methods --- + + Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_setkill") + { + } // Inst_SOPP__S_SETKILL + + Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL() + { + } // ~Inst_SOPP__S_SETKILL + + // --- description from .arch file --- + // set KILL bit to value of SIMM16[0]. + // Used primarily for debugging kill wave host command behavior. + void + Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_WAITCNT class methods --- + + Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_waitcnt") + { + setFlag(ALU); + setFlag(Waitcnt); + } // Inst_SOPP__S_WAITCNT + + Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT() + { + } // ~Inst_SOPP__S_WAITCNT + + // --- description from .arch file --- + // Wait for the counts of outstanding lds, vector-memory and + // --- export/vmem-write-data to be at or below the specified levels. + // SIMM16[3:0] = vmcount (vector memory operations), + // SIMM16[6:4] = export/mem-write-data count, + // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count). 
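The s_waitcnt immediate packs three independent counters, as listed above. A standalone decode of that packing, using the same bit ranges the execute() body below extracts:

    #include <cstdint>

    struct WaitCnts
    {
        uint32_t vmCnt;     // SIMM16[3:0]  - outstanding vector-memory ops
        uint32_t expCnt;    // SIMM16[6:4]  - export / mem-write-data count
        uint32_t lgkmCnt;   // SIMM16[12:8] - scalar-mem / GDS / LDS count
    };

    static WaitCnts decodeWaitcnt(uint16_t simm16)
    {
        return { simm16 & 0xfu, (simm16 >> 4) & 0x7u, (simm16 >> 8) & 0x1fu };
    }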
+ void + Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 vm_cnt = 0; + ScalarRegI32 exp_cnt = 0; + ScalarRegI32 lgkm_cnt = 0; + vm_cnt = bits(instData.SIMM16, 3, 0); + exp_cnt = bits(instData.SIMM16, 6, 4); + lgkm_cnt = bits(instData.SIMM16, 12, 8); + gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT); + gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt); + } // execute + // --- Inst_SOPP__S_SETHALT class methods --- + + Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sethalt") + { + } // Inst_SOPP__S_SETHALT + + Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT() + { + } // ~Inst_SOPP__S_SETHALT + + // --- description from .arch file --- + // Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume. + // The halt flag is ignored while PRIV == 1 (inside trap handlers) but the + // shader will halt immediately after the handler returns if HALT is still + // set at that time. + void + Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SLEEP class methods --- + + Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sleep") + { + setFlag(ALU); + setFlag(Sleep); + } // Inst_SOPP__S_SLEEP + + Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP() + { + } // ~Inst_SOPP__S_SLEEP + + // --- description from .arch file --- + // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks. + // The exact amount of delay is approximate. Compare with S_NOP. + void + Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16; + gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP); + // sleep duration is specified in multiples of 64 cycles + gpuDynInst->wavefront()->setSleepTime(64 * simm16); + } // execute + // --- Inst_SOPP__S_SETPRIO class methods --- + + Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_setprio") + { + setFlag(ALU); + } // Inst_SOPP__S_SETPRIO + + Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO() + { + } // ~Inst_SOPP__S_SETPRIO + + // --- description from .arch file --- + // User settable wave priority is set to SIMM16[1:0]. 0 = lowest, + // 3 = highest. + // The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0], + // WaveAge[3:0]}. + void + Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU16 simm16 = instData.SIMM16; + ScalarRegU32 userPrio = simm16 & 0x3; + + warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio); + } // execute + // --- Inst_SOPP__S_SENDMSG class methods --- + + Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sendmsg") + { + } // Inst_SOPP__S_SENDMSG + + Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG() + { + } // ~Inst_SOPP__S_SENDMSG + + // --- description from .arch file --- + // Send a message upstream to VGT or the interrupt handler. + // SIMM16[9:0] contains the message type and is documented in the shader + // --- programming guide. 
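A related decode for s_sleep, whose execute() appears a few instructions above: the .arch text scales only SIMM16[2:0] by 64 clocks, which agrees with the 64 * simm16 in the patch for the well-formed encodings 0..7. A minimal sketch under that assumption:

    #include <cstdint>

    // Sleep duration in cycles, per the .arch description (SIMM16[2:0] * 64).
    static uint32_t sleepCycles(uint16_t simm16)
    {
        return 64u * (simm16 & 0x7u);
    }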
+ void + Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SENDMSGHALT class methods --- + + Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sendmsghalt") + { + } // Inst_SOPP__S_SENDMSGHALT + + Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT() + { + } // ~Inst_SOPP__S_SENDMSGHALT + + // --- description from .arch file --- + // Send a message and then HALT the wavefront; see S_SENDMSG for details. + void + Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_TRAP class methods --- + + Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_trap") + { + } // Inst_SOPP__S_TRAP + + Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP() + { + } // ~Inst_SOPP__S_TRAP + + // --- description from .arch file --- + // TrapID = SIMM16[7:0]; + // Wait for all instructions to complete; + // set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0], + // PC[47:0]}; + // PC = TBA (trap base address); + // PRIV = 1. + // Enter the trap handler. This instruction may be generated internally as + // well in response to a host trap (HT = 1) or an exception. + // TrapID 0 is reserved for hardware use and should not be used in a + // shader-generated trap. + void + Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_ICACHE_INV class methods --- + + Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_icache_inv") + { + } // Inst_SOPP__S_ICACHE_INV + + Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() + { + } // ~Inst_SOPP__S_ICACHE_INV + + // --- description from .arch file --- + // Invalidate entire L1 instruction cache. + // You must have 12 separate S_NOP instructions or a jump/branch + // instruction after this instruction + // to ensure the SQ instruction buffer is purged. + void + Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_INCPERFLEVEL class methods --- + + Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_incperflevel") + { + } // Inst_SOPP__S_INCPERFLEVEL + + Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL() + { + } // ~Inst_SOPP__S_INCPERFLEVEL + + // --- description from .arch file --- + // Increment performance counter specified in SIMM16[3:0] by 1. + void + Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_DECPERFLEVEL class methods --- + + Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_decperflevel") + { + } // Inst_SOPP__S_DECPERFLEVEL + + Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL() + { + } // ~Inst_SOPP__S_DECPERFLEVEL + + // --- description from .arch file --- + // Decrement performance counter specified in SIMM16[3:0] by 1. + void + Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_TTRACEDATA class methods --- + + Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_ttracedata") + { + } // Inst_SOPP__S_TTRACEDATA + + Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA() + { + } // ~Inst_SOPP__S_TTRACEDATA + + // --- description from .arch file --- + // Send M0 as user data to the thread trace stream. 
+ void + Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods --- + + Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbgsys") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGSYS + + Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGSYS + + // --- description from .arch file --- + // if (conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4) + // + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods --- + + Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbguser") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGUSER + + Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGUSER + + // --- description from .arch file --- + // if (conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods --- + + Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER + + Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER:: + ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER + + // --- description from .arch file --- + // if (conditional_debug_system || conditional_debug_user) then PC = PC + + // --- signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods --- + + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER + + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: + ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER + + // --- description from .arch file --- + // if (conditional_debug_system && conditional_debug_user) then PC = PC + + // --- signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_ENDPGM_SAVED class methods --- + + Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_endpgm_saved") + { + } // Inst_SOPP__S_ENDPGM_SAVED + + Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED() + { + } // ~Inst_SOPP__S_ENDPGM_SAVED + + // --- description from .arch file --- + // End of program; signal that a wave has been saved by the context-switch + // trap handler and terminate wavefront. + // The hardware implicitly executes S_WAITCNT 0 before executing this + // instruction. + // Use S_ENDPGM in all cases unless you are executing the context-switch + // save handler. 
+ void + Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods --- + + Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_set_gpr_idx_off") + { + } // Inst_SOPP__S_SET_GPR_IDX_OFF + + Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF() + { + } // ~Inst_SOPP__S_SET_GPR_IDX_OFF + + // --- description from .arch file --- + // MODE.gpr_idx_en = 0. + // Clear GPR indexing mode. Vector operations after this will not perform + // --- relative GPR addressing regardless of the contents of M0. This + // --- instruction does not modify M0. + void + Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods --- + + Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_set_gpr_idx_mode") + { + } // Inst_SOPP__S_SET_GPR_IDX_MODE + + Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE() + { + } // ~Inst_SOPP__S_SET_GPR_IDX_MODE + + // --- description from .arch file --- + // M0[15:12] = SIMM4. + // Modify the mode used for vector GPR indexing. + // The raw contents of the source field are read and used to set the enable + // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL + // and SIMM4[3] = VDST_REL. + void + Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vinterp.cc b/src/arch/amdgpu/vega/insts/vinterp.cc new file mode 100644 index 0000000000..784f6f2eb2 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vinterp.cc @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VINTRP__V_INTERP_P1_F32 class methods --- + + Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32( + InFmt_VINTRP *iFmt) + : Inst_VINTRP(iFmt, "v_interp_p1_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VINTRP__V_INTERP_P1_F32 + + Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32() + { + } // ~Inst_VINTRP__V_INTERP_P1_F32 + + // --- description from .arch file --- + // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; + // if D == S then data corruption will occur. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VINTRP__V_INTERP_P2_F32 class methods --- + + Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32( + InFmt_VINTRP *iFmt) + : Inst_VINTRP(iFmt, "v_interp_p2_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VINTRP__V_INTERP_P2_F32 + + Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32() + { + } // ~Inst_VINTRP__V_INTERP_P2_F32 + + // --- description from .arch file --- + // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VINTRP__V_INTERP_MOV_F32 class methods --- + + Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32( + InFmt_VINTRP *iFmt) + : Inst_VINTRP(iFmt, "v_interp_mov_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VINTRP__V_INTERP_MOV_F32 + + Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32() + { + } // ~Inst_VINTRP__V_INTERP_MOV_F32 + + // --- description from .arch file --- + // D.f = {P10,P20,P0}[S.u]; parameter load. + void + Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc new file mode 100644 index 0000000000..fc41c0ae78 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop1.cc @@ -0,0 +1,2340 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP1__V_NOP class methods --- + + Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_nop") + { + setFlag(Nop); + setFlag(ALU); + } // Inst_VOP1__V_NOP + + Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP() + { + } // ~Inst_VOP1__V_NOP + + // --- description from .arch file --- + // Do nothing. + void + Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_VOP1__V_MOV_B32 class methods --- + + Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_mov_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_MOV_B32 + + Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32() + { + } // ~Inst_VOP1__V_MOV_B32 + + // --- description from .arch file --- + // D.u = S0.u. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (isDPPInst()) { + VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src_dpp.read(); + + DPRINTF(VEGA, "Handling V_MOV_B32 SRC DPP. 
SRC0: register v[%d], " + "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " + "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " + "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, + extData.iFmt_VOP_DPP.DPP_CTRL, + extData.iFmt_VOP_DPP.SRC0_ABS, + extData.iFmt_VOP_DPP.SRC0_NEG, + extData.iFmt_VOP_DPP.SRC1_ABS, + extData.iFmt_VOP_DPP.SRC1_NEG, + extData.iFmt_VOP_DPP.BC, + extData.iFmt_VOP_DPP.BANK_MASK, + extData.iFmt_VOP_DPP.ROW_MASK); + + // NOTE: For VOP1, there is no SRC1, so make sure we're not trying + // to negate it or take the absolute value of it + assert(!extData.iFmt_VOP_DPP.SRC1_ABS); + assert(!extData.iFmt_VOP_DPP.SRC1_NEG); + processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src_dpp[lane]; + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_READFIRSTLANE_B32 class methods --- + + Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_readfirstlane_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_READFIRSTLANE_B32 + + Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32() + { + } // ~Inst_VOP1__V_READFIRSTLANE_B32 + + // --- description from .arch file --- + // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data + // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec) + // (Lane# = 0 if exec is zero). Ignores exec mask for the access. SQ + // translates to V_READLANE_B32. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarRegI32 src_lane(0); + ScalarRegU64 exec_mask = wf->execMask().to_ullong(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (exec_mask) { + src_lane = findLsbSet(exec_mask); + } + + sdst = src[src_lane]; + + sdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_I32_F64 class methods --- + + Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_i32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_I32_F64 + + Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64() + { + } // ~Inst_VOP1__V_CVT_I32_F64 + + // --- description from .arch file --- + // D.i = (int)S0.d. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
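The float-to-integer conversions in this family all follow the saturation rule stated above: NaN becomes 0, and anything outside the destination range (including infinities) clamps to the type's minimum or maximum. A minimal standalone sketch of that rule for the signed 32-bit case, in plain C++ rather than the gem5 operand classes (the helper name is made up for illustration):

#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>

// Saturating double -> int32 conversion as described above (sketch only):
// NaN becomes 0; out-of-range values, including +/-inf, clamp to the
// int32 limits before the truncating cast.
static int32_t
cvtI32SatF64(double s)
{
    if (std::isnan(s)) {
        return 0;
    }
    if (s >= 2147483648.0) {                        // >= 2^31 cannot fit
        return std::numeric_limits<int32_t>::max();
    }
    if (s < -2147483648.0) {                        // below -2^31 cannot fit
        return std::numeric_limits<int32_t>::min();
    }
    return static_cast<int32_t>(s);                 // truncates toward zero
}

int main()
{
    std::cout << cvtI32SatF64(3.9) << '\n';          // 3
    std::cout << cvtI32SatF64(-1.0e12) << '\n';      // -2147483648
    std::cout << cvtI32SatF64(std::nan("")) << '\n'; // 0
}

The execute() implementations that follow reach the same decision by inspecting the exponent returned by std::frexp rather than comparing against the bounds directly.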
+ void + Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F64_I32 class methods --- + + Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f64_i32") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F64_I32 + + Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32() + { + } // ~Inst_VOP1__V_CVT_F64_I32 + + // --- description from .arch file --- + // D.d = (double)S0.i. + void + Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_I32 class methods --- + + Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_i32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_I32 + + Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32() + { + } // ~Inst_VOP1__V_CVT_F32_I32 + + // --- description from .arch file --- + // D.f = (float)S0.i. + void + Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_U32 class methods --- + + Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_u32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_U32 + + Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32() + { + } // ~Inst_VOP1__V_CVT_F32_U32 + + // --- description from .arch file --- + // D.f = (float)S0.u. + void + Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_U32_F32 class methods --- + + Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_u32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_U32_F32 + + Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32() + { + } // ~Inst_VOP1__V_CVT_U32_F32 + + // --- description from .arch file --- + // D.u = (unsigned)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_I32_F32 class methods --- + + Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_I32_F32 + + Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32() + { + } // ~Inst_VOP1__V_CVT_I32_F32 + + // --- description from .arch file --- + // D.i = (int)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. + void + Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_MOV_FED_B32 class methods --- + + Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_mov_fed_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_MOV_FED_B32 + + Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32() + { + } // ~Inst_VOP1__V_MOV_FED_B32 + + // --- description from .arch file --- + // D.u = S0.u; + // Introduce EDC double error upon write to dest vgpr without causing an + // --- exception. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F16_F32 class methods --- + + Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f16_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F16_F32 + + Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32() + { + } // ~Inst_VOP1__V_CVT_F16_F32 + + // --- description from .arch file --- + // D.f16 = flt32_to_flt16(S0.f). + // Supports input modifiers and creates FP16 denormals when appropriate. + void + Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F32_F16 class methods --- + + Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_f16") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_F16 + + Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16() + { + } // ~Inst_VOP1__V_CVT_F32_F16 + + // --- description from .arch file --- + // D.f = flt16_to_flt32(S0.f16). + // FP16 denormal inputs are always accepted. 
+ void + Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods --- + + Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_RPI_I32_F32 + + Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32() + { + } // ~Inst_VOP1__V_CVT_RPI_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f + 0.5). + void + Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_FLR_I32_F32 class methods --- + + Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_FLR_I32_F32 + + Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32() + { + } // ~Inst_VOP1__V_CVT_FLR_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f). + void + Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_OFF_F32_I4 class methods --- + + Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_off_f32_i4") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_OFF_F32_I4 + + Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4() + { + } // ~Inst_VOP1__V_CVT_OFF_F32_I4 + + // --- description from .arch file --- + // 4-bit signed int to 32-bit float. Used for interpolation in shader. + void + Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) + { + // Could not parse sq_uc.arch desc field + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F32_F64 class methods --- + + Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F32_F64 + + Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64() + { + } // ~Inst_VOP1__V_CVT_F32_F64 + + // --- description from .arch file --- + // D.f = (float)S0.d. 
+ void + Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F64_F32 class methods --- + + Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f64_f32") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F64_F32 + + Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32() + { + } // ~Inst_VOP1__V_CVT_F64_F32 + + // --- description from .arch file --- + // D.d = (double)S0.f. + void + Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE0 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE0 + + Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE0 + + // --- description from .arch file --- + // D.f = (float)(S0.u[7:0]). + void + Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE1 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE1 + + Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE1 + + // --- description from .arch file --- + // D.f = (float)(S0.u[15:8]). + void + Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE2 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE2 + + Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE2 + + // --- description from .arch file --- + // D.f = (float)(S0.u[23:16]). 
+ void + Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE3 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE3 + + Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE3 + + // --- description from .arch file --- + // D.f = (float)(S0.u[31:24]). + void + Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_U32_F64 class methods --- + + Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_u32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_U32_F64 + + Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64() + { + } // ~Inst_VOP1__V_CVT_U32_F64 + + // --- description from .arch file --- + // D.u = (unsigned)S0.d. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. + void + Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F64_U32 class methods --- + + Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f64_u32") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F64_U32 + + Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32() + { + } // ~Inst_VOP1__V_CVT_F64_U32 + + // --- description from .arch file --- + // D.d = (double)S0.u. 
+ void + Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_TRUNC_F64 class methods --- + + Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_trunc_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_TRUNC_F64 + + Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64() + { + } // ~Inst_VOP1__V_TRUNC_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d), return integer part of S0.d. + void + Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CEIL_F64 class methods --- + + Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ceil_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CEIL_F64 + + Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64() + { + } // ~Inst_VOP1__V_CEIL_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. + void + Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RNDNE_F64 class methods --- + + Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rndne_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_RNDNE_F64 + + Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64() + { + } // ~Inst_VOP1__V_RNDNE_F64 + + // --- description from .arch file --- + // D.d = round_nearest_even(S0.d). + void + Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FLOOR_F64 class methods --- + + Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_floor_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FLOOR_F64 + + Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64() + { + } // ~Inst_VOP1__V_FLOOR_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. 
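The floor and ceiling descriptions are written as trunc-and-adjust pseudo-code; a small standalone check (illustrative only, not part of the patch; the helper name is invented) that the floor form agrees with the std::floor call the implementations use:

#include <cassert>
#include <cmath>

// The .arch pseudo-code above: truncate toward zero, then step down by one
// for negative values that were not already integral (sketch only).
static double
floorViaTrunc(double s)
{
    double d = std::trunc(s);
    if (s < 0.0 && s != d) {
        d += -1.0;
    }
    return d;
}

int main()
{
    // Matches the std::floor call the execute() methods use directly.
    for (double x : {-2.5, -2.0, -0.1, 0.0, 0.1, 2.5}) {
        assert(floorViaTrunc(x) == std::floor(x));
    }
}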
+ void + Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FRACT_F32 class methods --- + + Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_fract_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FRACT_F32 + + Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32() + { + } // ~Inst_VOP1__V_FRACT_F32 + + // --- description from .arch file --- + // D.f = S0.f - floor(S0.f). + void + Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_TRUNC_F32 class methods --- + + Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_trunc_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_TRUNC_F32 + + Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32() + { + } // ~Inst_VOP1__V_TRUNC_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f), return integer part of S0.f. + void + Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst (gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CEIL_F32 class methods --- + + Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ceil_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CEIL_F32 + + Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32() + { + } // ~Inst_VOP1__V_CEIL_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. + void + Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RNDNE_F32 class methods --- + + Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rndne_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RNDNE_F32 + + Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32() + { + } // ~Inst_VOP1__V_RNDNE_F32 + + // --- description from .arch file --- + // D.f = round_nearest_even(S0.f). 
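The round-to-nearest-even instructions call a roundNearestEven() helper; as a standalone illustration of the semantics (assuming the default IEEE rounding mode is in effect), the same tie-to-even behaviour can be observed with std::nearbyint:

#include <cfenv>
#include <cmath>
#include <iostream>

int main()
{
    // Under FE_TONEAREST, std::nearbyint rounds halfway cases to the even
    // neighbour -- the behaviour the descriptions call round_nearest_even.
    std::fesetround(FE_TONEAREST);
    std::cout << std::nearbyint(0.5)  << '\n';   // 0
    std::cout << std::nearbyint(1.5)  << '\n';   // 2
    std::cout << std::nearbyint(2.5)  << '\n';   // 2
    std::cout << std::nearbyint(-1.5) << '\n';   // -2
}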
+ void + Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FLOOR_F32 class methods --- + + Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_floor_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FLOOR_F32 + + Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32() + { + } // ~Inst_VOP1__V_FLOOR_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. + void + Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_EXP_F32 class methods --- + + Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_exp_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_EXP_F32 + + Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32() + { + } // ~Inst_VOP1__V_EXP_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f). + void + Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::pow(2.0, src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_LOG_F32 class methods --- + + Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_log_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_LOG_F32 + + Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32() + { + } // ~Inst_VOP1__V_LOG_F32 + + // --- description from .arch file --- + // D.f = log2(S0.f). Base 2 logarithm. + void + Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::log2(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RCP_F32 class methods --- + + Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RCP_F32 + + Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32() + { + } // ~Inst_VOP1__V_RCP_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error. 
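A quick standalone check, assuming IEC 60559 (IEEE-754) floats, of what plain division yields for the reciprocal's corner cases; the unbranched F32 implementation below relies on this, while the F64 variant later in the file handles zero, NaN and infinity with explicit branches:

#include <cmath>
#include <iostream>

int main()
{
    // Corner cases of 1.0f / x under IEEE-754 arithmetic (sketch only;
    // volatile just keeps the divisions from being folded at compile time).
    volatile float zero = 0.0f;
    volatile float inf  = INFINITY;
    volatile float qnan = std::nanf("");

    std::cout << 1.0f / zero  << '\n';   // inf
    std::cout << 1.0f / -zero << '\n';   // -inf
    std::cout << 1.0f / inf   << '\n';   // 0
    std::cout << 1.0f / qnan  << '\n';   // nan
}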
+ void + Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RCP_IFLAG_F32 class methods --- + + Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_iflag_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RCP_IFLAG_F32 + + Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32() + { + } // ~Inst_VOP1__V_RCP_IFLAG_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise + // --- integer DIV_BY_ZERO exception but cannot raise floating-point + // --- exceptions. + void + Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RSQ_F32 class methods --- + + Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rsq_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RSQ_F32 + + Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32() + { + } // ~Inst_VOP1__V_RSQ_F32 + + // --- description from .arch file --- + // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules. + void + Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RCP_F64 class methods --- + + Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_RCP_F64 + + Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64() + { + } // ~Inst_VOP1__V_RCP_F64 + + // --- description from .arch file --- + // D.d = 1.0 / S0.d. + void + Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = 0.0; + } + } else { + vdst[lane] = 1.0 / src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RSQ_F64 class methods --- + + Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rsq_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_RSQ_F64 + + Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64() + { + } // ~Inst_VOP1__V_RSQ_F64 + + // --- description from .arch file --- + // D.d = 1.0 / sqrt(S0.d). 
See V_RSQ_F32. + void + Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane]) + && !std::signbit(src[lane])) { + vdst[lane] = 0.0; + } else if (std::signbit(src[lane])) { + vdst[lane] = NAN; + } else { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_SQRT_F32 class methods --- + + Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sqrt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_SQRT_F32 + + Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32() + { + } // ~Inst_VOP1__V_SQRT_F32 + + // --- description from .arch file --- + // D.f = sqrt(S0.f). + void + Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_SQRT_F64 class methods --- + + Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sqrt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_SQRT_F64 + + Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64() + { + } // ~Inst_VOP1__V_SQRT_F64 + + // --- description from .arch file --- + // D.d = sqrt(S0.d). + void + Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_SIN_F32 class methods --- + + Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sin_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_SIN_F32 + + Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32() + { + } // ~Inst_VOP1__V_SIN_F32 + + // --- description from .arch file --- + // D.f = sin(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 0.0. 
+ void + Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + ConstScalarOperandF32 pi(gpuDynInst, REG_PI); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + pi.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (src[lane] < -256.0 || src[lane] > 256.0) { + vdst[lane] = 0.0; + } else { + vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData()); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_COS_F32 class methods --- + + Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cos_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_COS_F32 + + Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32() + { + } // ~Inst_VOP1__V_COS_F32 + + // --- description from .arch file --- + // D.f = cos(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 1.0. + void + Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + ConstScalarOperandF32 pi(gpuDynInst, REG_PI); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + pi.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (src[lane] < -256.0 || src[lane] > 256.0) { + vdst[lane] = 0.0; + } else { + vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData()); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_NOT_B32 class methods --- + + Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_not_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_NOT_B32 + + Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32() + { + } // ~Inst_VOP1__V_NOT_B32 + + // --- description from .arch file --- + // D.u = ~S0.u. + // Input and output modifiers not supported. + void + Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ~src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_BFREV_B32 class methods --- + + Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_bfrev_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_BFREV_B32 + + Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32() + { + } // ~Inst_VOP1__V_BFREV_B32 + + // --- description from .arch file --- + // D.u[31:0] = S0.u[0:31], bitfield reverse. + // Input and output modifiers not supported. 
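The bitfield-reverse implementation delegates to a reverseBits() helper; a minimal standalone sketch of the D.u[31:0] = S0.u[0:31] operation described above (the function name here is illustrative, not gem5's):

#include <cstdint>
#include <cstdio>

// Reverse the bit order of a 32-bit value: bit i of the input becomes
// bit 31-i of the result (sketch only).
static uint32_t
bfrev32(uint32_t v)
{
    uint32_t r = 0;
    for (int i = 0; i < 32; ++i) {
        r = (r << 1) | ((v >> i) & 1u);
    }
    return r;
}

int main()
{
    std::printf("0x%08x\n", static_cast<unsigned>(bfrev32(0x00000001u)));
    // prints 0x80000000
    std::printf("0x%08x\n", static_cast<unsigned>(bfrev32(0x12345678u)));
    // prints 0x1e6a2c48
}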
+ void + Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = reverseBits(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FFBH_U32 class methods --- + + Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ffbh_u32") + { + setFlag(ALU); + } // Inst_VOP1__V_FFBH_U32 + + Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32() + { + } // ~Inst_VOP1__V_FFBH_U32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from MSB; + // D.u = 0xffffffff if S0.u == 0. + void + Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOneMsb(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FFBL_B32 class methods --- + + Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ffbl_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_FFBL_B32 + + Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32() + { + } // ~Inst_VOP1__V_FFBL_B32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from LSB; + // D.u = 0xffffffff if S0.u == 0. + void + Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOne(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FFBH_I32 class methods --- + + Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ffbh_i32") + { + setFlag(ALU); + } // Inst_VOP1__V_FFBH_I32 + + Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32() + { + } // ~Inst_VOP1__V_FFBH_I32 + + // --- description from .arch file --- + // D.u = position of first bit different from sign bit in S0.i from MSB; + // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. + void + Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = firstOppositeSignBit(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_EXP_I32_F64 class methods --- + + Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FREXP_EXP_I32_F64 + + Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64() + { + } // ~Inst_VOP1__V_FREXP_EXP_I32_F64 + + // --- description from .arch file --- + // See V_FREXP_EXP_I32_F32. 
+ void + Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp = 0; + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_MANT_F64 class methods --- + + Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_mant_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FREXP_MANT_F64 + + Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64() + { + } // ~Inst_VOP1__V_FREXP_MANT_F64 + + // --- description from .arch file --- + // See V_FREXP_MANT_F32. + void + Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = src[lane]; + } else { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FRACT_F64 class methods --- + + Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_fract_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FRACT_F64 + + Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64() + { + } // ~Inst_VOP1__V_FRACT_F64 + + // --- description from .arch file --- + // See V_FRACT_F32. + void + Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF64 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_EXP_I32_F32 class methods --- + + Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FREXP_EXP_I32_F32 + + Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32() + { + } // ~Inst_VOP1__V_FREXP_EXP_I32_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.i = 0; + // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). + // Returns exponent of single precision float input, such that S0.f = + // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns + // the significand. 
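The description above is the C frexp() contract expressed in hardware terms; a short standalone example of the significand/exponent split that V_FREXP_MANT_F32 returns one half of and this opcode the other:

#include <cmath>
#include <cstdio>

int main()
{
    // std::frexp splits x into  x = mant * 2^exp  with |mant| in [0.5, 1),
    // which matches the (exponent - 127 + 1) convention in the description.
    float x = 192.0f;                 // 0.75 * 2^8
    int exp = 0;
    float mant = std::frexp(x, &exp);
    std::printf("%g = %g * 2^%d\n", x, mant, exp);   // 192 = 0.75 * 2^8
    std::printf("%g\n", std::ldexp(mant, exp));      // reconstructs 192
}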
+ void + Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_MANT_F32 class methods --- + + Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_mant_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FREXP_MANT_F32 + + Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32() + { + } // ~Inst_VOP1__V_FREXP_MANT_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; + // else D.f = Mantissa(S0.f). + // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary + // --- significand of single precision float input, such that S0.f = + // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which + // --- returns integer exponent. + void + Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = src[lane]; + } else { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CLREXCP class methods --- + + Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_clrexcp") + { + setFlag(ALU); + } // Inst_VOP1__V_CLREXCP + + Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP() + { + } // ~Inst_VOP1__V_CLREXCP + + // --- description from .arch file --- + // Clear wave's exception state in SIMD (SP). + void + Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F16_U16 class methods --- + + Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f16_u16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_F16_U16 + + Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16() + { + } // ~Inst_VOP1__V_CVT_F16_U16 + + // --- description from .arch file --- + // D.f16 = uint16_to_flt16(S.u16). + // Supports denormals, rounding, exception flags and saturation. + void + Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F16_I16 class methods --- + + Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f16_i16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_F16_I16 + + Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16() + { + } // ~Inst_VOP1__V_CVT_F16_I16 + + // --- description from .arch file --- + // D.f16 = int16_to_flt16(S.i16). + // Supports denormals, rounding, exception flags and saturation. 
+ void + Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_U16_F16 class methods --- + + Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_u16_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_U16_F16 + + Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16() + { + } // ~Inst_VOP1__V_CVT_U16_F16 + + // --- description from .arch file --- + // D.u16 = flt16_to_uint16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_I16_F16 class methods --- + + Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_i16_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_I16_F16 + + Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16() + { + } // ~Inst_VOP1__V_CVT_I16_F16 + + // --- description from .arch file --- + // D.i16 = flt16_to_int16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_RCP_F16 class methods --- + + Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_RCP_F16 + + Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16() + { + } // ~Inst_VOP1__V_RCP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecip(S0.f16). + void + Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_SQRT_F16 class methods --- + + Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sqrt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_SQRT_F16 + + Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16() + { + } // ~Inst_VOP1__V_SQRT_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateSqrt(S0.f16). + void + Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_RSQ_F16 class methods --- + + Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rsq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_RSQ_F16 + + Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16() + { + } // ~Inst_VOP1__V_RSQ_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecipSqrt(S0.f16). + void + Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_LOG_F16 class methods --- + + Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_log_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_LOG_F16 + + Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16() + { + } // ~Inst_VOP1__V_LOG_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 0.0f; + // else + // D.f16 = ApproximateLog2(S0.f16). 
+ void + Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_EXP_F16 class methods --- + + Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_exp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_EXP_F16 + + Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16() + { + } // ~Inst_VOP1__V_EXP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 0.0f) + // D.f16 = 1.0f; + // else + // D.f16 = Approximate2ToX(S0.f16). + void + Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FREXP_MANT_F16 class methods --- + + Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_mant_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FREXP_MANT_F16 + + Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16() + { + } // ~Inst_VOP1__V_FREXP_MANT_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.f16 = S0.f16; + // else + // D.f16 = mantissa(S0.f16). + // Result range is (-1.0,-0.5][0.5,1.0). + // C math library frexp function. + // Returns binary significand of half precision float input, such that the + // original single float = significand * (2 ** exponent). + void + Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FREXP_EXP_I16_F16 class methods --- + + Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FREXP_EXP_I16_F16 + + Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16() + { + } // ~Inst_VOP1__V_FREXP_EXP_I16_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.i16 = 0; + // else + // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). + // C math library frexp function. + // Returns exponent of half precision float input, such that the + // original single float = significand * (2 ** exponent). + void + Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FLOOR_F16 class methods --- + + Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_floor_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FLOOR_F16 + + Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16() + { + } // ~Inst_VOP1__V_FLOOR_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. + void + Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CEIL_F16 class methods --- + + Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ceil_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CEIL_F16 + + Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16() + { + } // ~Inst_VOP1__V_CEIL_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
+ void + Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_TRUNC_F16 class methods --- + + Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_trunc_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_TRUNC_F16 + + Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16() + { + } // ~Inst_VOP1__V_TRUNC_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16). + // Round-to-zero semantics. + void + Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_RNDNE_F16 class methods --- + + Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rndne_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_RNDNE_F16 + + Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16() + { + } // ~Inst_VOP1__V_RNDNE_F16 + + // --- description from .arch file --- + // D.f16 = FLOOR(S0.f16 + 0.5f); + // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. + // Round-to-nearest-even semantics. + void + Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FRACT_F16 class methods --- + + Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_fract_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FRACT_F16 + + Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16() + { + } // ~Inst_VOP1__V_FRACT_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + -floor(S0.f16). + void + Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_SIN_F16 class methods --- + + Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sin_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_SIN_F16 + + Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16() + { + } // ~Inst_VOP1__V_SIN_F16 + + // --- description from .arch file --- + // D.f16 = sin(S0.f16 * 2 * PI). + void + Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_COS_F16 class methods --- + + Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cos_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_COS_F16 + + Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16() + { + } // ~Inst_VOP1__V_COS_F16 + + // --- description from .arch file --- + // D.f16 = cos(S0.f16 * 2 * PI). + void + Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_EXP_LEGACY_F32 class methods --- + + Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_exp_legacy_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_EXP_LEGACY_F32 + + Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32() + { + } // ~Inst_VOP1__V_EXP_LEGACY_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f) with legacy semantics. 
+ void + Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::pow(2.0, src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_LOG_LEGACY_F32 class methods --- + + Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_log_legacy_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_LOG_LEGACY_F32 + + Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32() + { + } // ~Inst_VOP1__V_LOG_LEGACY_F32 + + // --- description from .arch file --- + // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. + void + Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::log2(src[lane]); + } + } + + vdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop2.cc b/src/arch/amdgpu/vega/insts/vop2.cc new file mode 100644 index 0000000000..ddd77e27da --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop2.cc @@ -0,0 +1,2187 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "debug/VEGA.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP2__V_CNDMASK_B32 class methods --- + + Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_cndmask_b32") + { + setFlag(ALU); + setFlag(ReadsVCC); + } // Inst_VOP2__V_CNDMASK_B32 + + Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32() + { + } // ~Inst_VOP2__V_CNDMASK_B32 + + // --- description from .arch file --- + // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC + // as a scalar GPR in S2. + void + Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ADD_F32 class methods --- + + Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_ADD_F32 + + Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32() + { + } // ~Inst_VOP2__V_ADD_F32 + + // --- description from .arch file --- + // D.f = S0.f + S1.f. + void + Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + VecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + if (isDPPInst()) { + VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src0_dpp.read(); + + DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], " + "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " + "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " + "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, + extData.iFmt_VOP_DPP.DPP_CTRL, + extData.iFmt_VOP_DPP.SRC0_ABS, + extData.iFmt_VOP_DPP.SRC0_NEG, + extData.iFmt_VOP_DPP.SRC1_ABS, + extData.iFmt_VOP_DPP.SRC1_NEG, + extData.iFmt_VOP_DPP.BC, + extData.iFmt_VOP_DPP.BANK_MASK, + extData.iFmt_VOP_DPP.ROW_MASK); + + processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_dpp[lane] + src1[lane]; + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_F32 class methods --- + + Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_SUB_F32 + + Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32() + { + } // ~Inst_VOP2__V_SUB_F32 + + // --- description from .arch file --- + // D.f = S0.f - S1.f. + // SQ translates to V_ADD_F32. 
+ void + Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_F32 class methods --- + + Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_SUBREV_F32 + + Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32() + { + } // ~Inst_VOP2__V_SUBREV_F32 + + // --- description from .arch file --- + // D.f = S1.f - S0.f. + // SQ translates to V_ADD_F32. + void + Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods --- + + Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_legacy_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MUL_LEGACY_F32 + + Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32() + { + } // ~Inst_VOP2__V_MUL_LEGACY_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). + void + Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_F32 class methods --- + + Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MUL_F32 + + Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32() + { + } // ~Inst_VOP2__V_MUL_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f. 
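+ // --- illustrative note (not from the .arch file) ---
+ // The loop below spells out the IEEE-754 special cases instead of relying
+ // on the host multiply alone: a NaN operand yields NaN, 0 * +/-INF yields
+ // NaN, and zero or infinity results take the sign of the product (e.g.
+ // -0.0f * 4.0f = -0.0f and -INF * 2.0f = -INF). Finite, nonzero operands
+ // fall through to the ordinary src0 * src1 multiply.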
+ void + Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane])) { + vdst[lane] = NAN; + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + !std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if (std::isinf(src0[lane]) && + !std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else if (std::isinf(src0[lane]) && + std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else { + vdst[lane] = src0[lane] * src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_I32_I24 class methods --- + + Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_i32_i24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_I32_I24 + + Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24() + { + } // ~Inst_VOP2__V_MUL_I32_I24 + + // --- description from .arch file --- + // D.i = S0.i[23:0] * S1.i[23:0]. + void + Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) + * sext<24>(bits(src1[lane], 23, 0)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods --- + + Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_hi_i32_i24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_HI_I32_I24 + + Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24() + { + } // ~Inst_VOP2__V_MUL_HI_I32_I24 + + // --- description from .arch file --- + // D.i = (S0.i[23:0] * S1.i[23:0])>>32. 
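+ // --- illustrative note (not from the .arch file) ---
+ // Both 24-bit fields are sign-extended to 64 bits before multiplying, so
+ // the product is an exact 48-bit value and bits [63:32] form the result.
+ // Example: S0[23:0] = 0x800000 (-8388608) and S1[23:0] = 0x000002 give
+ // -16777216 = 0xFFFFFFFFFF000000, so D.i = 0xFFFFFFFF (-1).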
+ void + Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 tmp_src0 + = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); + VecElemI64 tmp_src1 + = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); + + vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_U32_U24 class methods --- + + Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_u32_u24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_U32_U24 + + Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24() + { + } // ~Inst_VOP2__V_MUL_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0]. + void + Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1, + VecOperandU32& vdst, Wavefront* wf) { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * + bits(src1[lane], 23, 0); + } + } + }; + + vop2Helper(gpuDynInst, opImpl); + } // execute + // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods --- + + Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_hi_u32_u24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_HI_U32_U24 + + Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24() + { + } // ~Inst_VOP2__V_MUL_HI_U32_U24 + + // --- description from .arch file --- + // D.i = (S0.u[23:0] * S1.u[23:0])>>32. + void + Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); + VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); + vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_F32 class methods --- + + Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MIN_F32 + + Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() + { + } // ~Inst_VOP2__V_MIN_F32 + + // --- description from .arch file --- + // D.f = (S0.f < S1.f ? S0.f : S1.f). 
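+ // --- illustrative note (not from the .arch file) ---
+ // The implementation below uses std::fmin, which returns the non-NaN
+ // operand when exactly one input is NaN (e.g. fmin(2.0f, NAN) = 2.0f),
+ // whereas a literal reading of the comparison above would return S1 in
+ // that case. For ordinary inputs the two agree: fmin(2.0f, -3.0f) = -3.0f.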
+ void + Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_F32 class methods --- + + Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MAX_F32 + + Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() + { + } // ~Inst_VOP2__V_MAX_F32 + + // --- description from .arch file --- + // D.f = (S0.f >= S1.f ? S0.f : S1.f). + void + Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_I32 class methods --- + + Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_I32 + + Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() + { + } // ~Inst_VOP2__V_MIN_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i). + void + Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_I32 class methods --- + + Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_I32 + + Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() + { + } // ~Inst_VOP2__V_MAX_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i). + void + Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_U32 class methods --- + + Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_U32 + + Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() + { + } // ~Inst_VOP2__V_MIN_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u). 
+ void + Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_U32 class methods --- + + Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_U32 + + Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() + { + } // ~Inst_VOP2__V_MAX_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u). + void + Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHRREV_B32 class methods --- + + Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshrrev_b32") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHRREV_B32 + + Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() + { + } // ~Inst_VOP2__V_LSHRREV_B32 + + // --- description from .arch file --- + // D.u = S1.u >> S0.u[4:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ASHRREV_I32 class methods --- + + Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ashrrev_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_ASHRREV_I32 + + Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() + { + } // ~Inst_VOP2__V_ASHRREV_I32 + + // --- description from .arch file --- + // D.i = signext(S1.i) >> S0.i[4:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
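+ // --- illustrative note (not from the .arch file) ---
+ // Reversed operand order: the shift amount comes from S0[4:0] and the
+ // value being shifted from S1. Since src1 is held in a signed 32-bit
+ // operand, '>>' is an arithmetic shift on the usual host targets, e.g.
+ // S1 = 0xFFFFFFF8 (-8) shifted right by S0 = 1 gives 0xFFFFFFFC (-4).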
+ void + Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHLREV_B32 class methods --- + + Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshlrev_b32") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHLREV_B32 + + Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32() + { + } // ~Inst_VOP2__V_LSHLREV_B32 + + // --- description from .arch file --- + // D.u = S1.u << S0.u[4:0]. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + if (isSDWAInst()) { + VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); + // use copies of original src0, src1, and vdst during selecting + VecOperandU32 origSrc0_sdwa(gpuDynInst, + extData.iFmt_VOP_SDWA.SRC0); + VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); + + DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register " + "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: " + "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: " + "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", + extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, + extData.iFmt_VOP_SDWA.DST_U, + extData.iFmt_VOP_SDWA.CLMP, + extData.iFmt_VOP_SDWA.SRC0_SEL, + extData.iFmt_VOP_SDWA.SRC0_SEXT, + extData.iFmt_VOP_SDWA.SRC0_NEG, + extData.iFmt_VOP_SDWA.SRC0_ABS, + extData.iFmt_VOP_SDWA.SRC1_SEL, + extData.iFmt_VOP_SDWA.SRC1_SEXT, + extData.iFmt_VOP_SDWA.SRC1_NEG, + extData.iFmt_VOP_SDWA.SRC1_ABS); + + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0); + origVdst[lane] = vdst[lane]; // keep copy consistent + } + } + + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_AND_B32 class methods --- + + Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_and_b32") + { + setFlag(ALU); + } // Inst_VOP2__V_AND_B32 + + Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32() + { + } // ~Inst_VOP2__V_AND_B32 + + // --- description from .arch file --- + // D.u = S0.u & S1.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + if (isDPPInst()) { + VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src0_dpp.read(); + + DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. SRC0: register v[%d], " + "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " + "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " + "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, + extData.iFmt_VOP_DPP.DPP_CTRL, + extData.iFmt_VOP_DPP.SRC0_ABS, + extData.iFmt_VOP_DPP.SRC0_NEG, + extData.iFmt_VOP_DPP.SRC1_ABS, + extData.iFmt_VOP_DPP.SRC1_NEG, + extData.iFmt_VOP_DPP.BC, + extData.iFmt_VOP_DPP.BANK_MASK, + extData.iFmt_VOP_DPP.ROW_MASK); + + processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_dpp[lane] & src1[lane]; + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] & src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_OR_B32 class methods --- + + Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_or_b32") + { + setFlag(ALU); + } // Inst_VOP2__V_OR_B32 + + Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32() + { + } // ~Inst_VOP2__V_OR_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u. + // Input and output modifiers not supported. + void + Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + if (isSDWAInst()) { + VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); + // use copies of original src0, src1, and dest during selecting + VecOperandU32 origSrc0_sdwa(gpuDynInst, + extData.iFmt_VOP_SDWA.SRC0); + VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); + + DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. 
SRC0: register v[%d], " + "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " + "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " + "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", + extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, + extData.iFmt_VOP_SDWA.DST_U, + extData.iFmt_VOP_SDWA.CLMP, + extData.iFmt_VOP_SDWA.SRC0_SEL, + extData.iFmt_VOP_SDWA.SRC0_SEXT, + extData.iFmt_VOP_SDWA.SRC0_NEG, + extData.iFmt_VOP_SDWA.SRC0_ABS, + extData.iFmt_VOP_SDWA.SRC1_SEL, + extData.iFmt_VOP_SDWA.SRC1_SEXT, + extData.iFmt_VOP_SDWA.SRC1_NEG, + extData.iFmt_VOP_SDWA.SRC1_ABS); + + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] | src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + } + } + + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_XOR_B32 class methods --- + + Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_xor_b32") + { + setFlag(ALU); + } // Inst_VOP2__V_XOR_B32 + + Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32() + { + } // ~Inst_VOP2__V_XOR_B32 + + // --- description from .arch file --- + // D.u = S0.u ^ S1.u. + // Input and output modifiers not supported. + void + Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] ^ src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAC_F32 class methods --- + + Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mac_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(MAC); + } // Inst_VOP2__V_MAC_F32 + + Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32() + { + } // ~Inst_VOP2__V_MAC_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + D.f. + // SQ translates to V_MAD_F32. + void + Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + VecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + vdst.read(); + + if (isDPPInst()) { + VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src0_dpp.read(); + + DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. 
SRC0: register v[%d], " + "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " + "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " + "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, + extData.iFmt_VOP_DPP.DPP_CTRL, + extData.iFmt_VOP_DPP.SRC0_ABS, + extData.iFmt_VOP_DPP.SRC0_NEG, + extData.iFmt_VOP_DPP.SRC1_ABS, + extData.iFmt_VOP_DPP.SRC1_NEG, + extData.iFmt_VOP_DPP.BC, + extData.iFmt_VOP_DPP.BANK_MASK, + extData.iFmt_VOP_DPP.ROW_MASK); + + processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0_dpp[lane], src1[lane], + vdst[lane]); + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MADMK_F32 class methods --- + + Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madmk_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(MAD); + } // Inst_VOP2__V_MADMK_F32 + + Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32() + { + } // ~Inst_VOP2__V_MADMK_F32 + + // --- description from .arch file --- + // D.f = S0.f * K + S1.f; K is a 32-bit inline constant. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // --- modifiers. + // SQ translates to V_MAD_F32. + void + Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + VecElemF32 k = extData.imm_f32; + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], k, src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MADAK_F32 class methods --- + + Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madak_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(MAD); + } // Inst_VOP2__V_MADAK_F32 + + Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() + { + } // ~Inst_VOP2__V_MADAK_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // --- modifiers. + // SQ translates to V_MAD_F32. + void + Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + VecElemF32 k = extData.imm_f32; + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], k); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ADD_CO_U32 class methods --- + + Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP2__V_ADD_CO_U32 + + Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32() + { + } // ~Inst_VOP2__V_ADD_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED + // --- overflow or carry-out for V_ADDC_U32. 
+ // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + void + Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + if (isSDWAInst()) { + VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); + // use copies of original src0, src1, and dest during selecting + VecOperandU32 origSrc0_sdwa(gpuDynInst, + extData.iFmt_VOP_SDWA.SRC0); + VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); + + DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. SRC0: register " + "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " + "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " + "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", + extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, + extData.iFmt_VOP_SDWA.DST_U, + extData.iFmt_VOP_SDWA.CLMP, + extData.iFmt_VOP_SDWA.SRC0_SEL, + extData.iFmt_VOP_SDWA.SRC0_SEXT, + extData.iFmt_VOP_SDWA.SRC0_NEG, + extData.iFmt_VOP_SDWA.SRC0_ABS, + extData.iFmt_VOP_SDWA.SRC1_SEL, + extData.iFmt_VOP_SDWA.SRC1_SEXT, + extData.iFmt_VOP_SDWA.SRC1_NEG, + extData.iFmt_VOP_SDWA.SRC1_ABS); + + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] + src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] + + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); + } + } + + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + vcc.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); + } + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_CO_U32 class methods --- + + Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP2__V_SUB_CO_U32 + + Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32() + { + } // ~Inst_VOP2__V_SUB_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or + // carry-out for V_SUBB_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + void + Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + vcc.setBit(lane, src1[lane] > src0[lane] ? 
1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_CO_U32 class methods --- + + Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP2__V_SUBREV_CO_U32 + + Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32() + { + } // ~Inst_VOP2__V_SUBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u; + // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or + // carry-out for V_SUBB_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + void + Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_ADDC_CO_U32 class methods --- + + Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_addc_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP2__V_ADDC_CO_U32 + + Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32() + { + } // ~Inst_VOP2__V_ADDC_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + VCC[threadId]; + // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) + // is an UNSIGNED overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. + void + Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane] + + bits(vcc.rawData(), lane); + vcc.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane] + + (VecElemU64)bits(vcc.rawData(), lane, lane)) + >= 0x100000000 ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_SUBB_CO_U32 class methods --- + + Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subb_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP2__V_SUBB_CO_U32 + + Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32() + { + } // ~Inst_VOP2__V_SUBB_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // --- overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // --- source comes from the SGPR-pair at S2.u. 
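+ // --- illustrative note (not from the .arch file) ---
+ // Subtract-with-borrow building block for wide subtraction: the incoming
+ // VCC bit is the borrow from the lower word and the outgoing VCC bit
+ // flags a new borrow. Example: S0 = 0x0, S1 = 0x1, VCC in = 0 gives
+ // D = 0xFFFFFFFF and VCC out = 1, which the next higher word of a 64-bit
+ // subtraction consumes through another V_SUBB_CO_U32.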
+ void + Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); + vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods --- + + Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subbrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP2__V_SUBBREV_CO_U32 + + Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32() + { + } // ~Inst_VOP2__V_SUBBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. + // SQ translates this to V_SUBREV_U32 with reversed operands. + void + Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); + vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) + > src1[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_ADD_F16 class methods --- + + Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_ADD_F16 + + Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() + { + } // ~Inst_VOP2__V_ADD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_SUB_F16 class methods --- + + Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_SUB_F16 + + Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() + { + } // ~Inst_VOP2__V_SUB_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 - S1.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. 
+ void + Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_SUBREV_F16 class methods --- + + Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_SUBREV_F16 + + Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() + { + } // ~Inst_VOP2__V_SUBREV_F16 + + // --- description from .arch file --- + // D.f16 = S1.f16 - S0.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. + void + Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MUL_F16 class methods --- + + Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MUL_F16 + + Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() + { + } // ~Inst_VOP2__V_MUL_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MAC_F16 class methods --- + + Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mac_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAC); + } // Inst_VOP2__V_MAC_F16 + + Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() + { + } // ~Inst_VOP2__V_MAC_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + D.f16. + // Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. + void + Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MADMK_F16 class methods --- + + Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madmk_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP2__V_MADMK_F16 + + Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() + { + } // ~Inst_VOP2__V_MADMK_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored + // in the following literal DWORD. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // modifiers. Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. + void + Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MADAK_F16 class methods --- + + Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madak_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP2__V_MADAK_F16 + + Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() + { + } // ~Inst_VOP2__V_MADAK_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored + // in the following literal DWORD. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // modifiers. Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. 
+ void + Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_ADD_U16 class methods --- + + Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_ADD_U16 + + Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() + { + } // ~Inst_VOP2__V_ADD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 + S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_U16 class methods --- + + Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_SUB_U16 + + Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() + { + } // ~Inst_VOP2__V_SUB_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 - S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_U16 class methods --- + + Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_SUBREV_U16 + + Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() + { + } // ~Inst_VOP2__V_SUBREV_U16 + + // --- description from .arch file --- + // D.u16 = S1.u16 - S0.u16. + // Supports saturation (unsigned 16-bit integer domain). + // SQ translates this to V_SUB_U16 with reversed operands. + void + Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_LO_U16 class methods --- + + Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_lo_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_LO_U16 + + Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() + { + } // ~Inst_VOP2__V_MUL_LO_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16. + // Supports saturation (unsigned 16-bit integer domain). 
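+ // --- illustrative note (not from the .arch file) ---
+ // Only the low 16 bits of the product are kept, so the result wraps on
+ // overflow: 0x0200 * 0x0300 = 0x60000, which truncates to D.u16 = 0x0000.
+ // Saturation (clamping) is not applied by the plain multiply below.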
+ void + Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHLREV_B16 class methods --- + + Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshlrev_b16") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHLREV_B16 + + Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() + { + } // ~Inst_VOP2__V_LSHLREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHRREV_B16 class methods --- + + Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshrrev_b16") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHRREV_B16 + + Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() + { + } // ~Inst_VOP2__V_LSHRREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ASHRREV_I16 class methods --- + + Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ashrrev_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_ASHRREV_I16 + + Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() + { + } // ~Inst_VOP2__V_ASHRREV_I16 + + // --- description from .arch file --- + // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
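+ // --- illustrative note (not from the .arch file) ---
+ // 16-bit arithmetic shift right with reversed operands, mirroring
+ // V_ASHRREV_I32 above: e.g. S1 = 0xFFF0 (-16) shifted by S0 = 2 gives
+ // 0xFFFC (-4); the sign bit of S1 fills the vacated bit positions.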
+ void + Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_F16 class methods --- + + Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MAX_F16 + + Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() + { + } // ~Inst_VOP2__V_MAX_F16 + + // --- description from .arch file --- + // D.f16 = max(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MIN_F16 class methods --- + + Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MIN_F16 + + Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() + { + } // ~Inst_VOP2__V_MIN_F16 + + // --- description from .arch file --- + // D.f16 = min(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MAX_U16 class methods --- + + Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_U16 + + Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() + { + } // ~Inst_VOP2__V_MAX_U16 + + // --- description from .arch file --- + // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). + void + Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_I16 class methods --- + + Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_I16 + + Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() + { + } // ~Inst_VOP2__V_MAX_I16 + + // --- description from .arch file --- + // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). 
+ void + Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_U16 class methods --- + + Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_U16 + + Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() + { + } // ~Inst_VOP2__V_MIN_U16 + + // --- description from .arch file --- + // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). + void + Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_I16 class methods --- + + Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_I16 + + Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() + { + } // ~Inst_VOP2__V_MIN_I16 + + // --- description from .arch file --- + // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). + void + Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LDEXP_F16 class methods --- + + Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ldexp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_LDEXP_F16 + + Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() + { + } // ~Inst_VOP2__V_LDEXP_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * (2 ** S1.i16). 
+ void + Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_ADD_U32 class methods --- + + Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_ADD_U32 + + Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() + { + } // ~Inst_VOP2__V_ADD_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + void + Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + if (isSDWAInst()) { + VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); + // use copies of original src0, src1, and dest during selecting + VecOperandU32 origSrc0_sdwa(gpuDynInst, + extData.iFmt_VOP_SDWA.SRC0); + VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); + + DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], " + "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " + "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " + "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", + extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, + extData.iFmt_VOP_SDWA.DST_U, + extData.iFmt_VOP_SDWA.CLMP, + extData.iFmt_VOP_SDWA.SRC0_SEL, + extData.iFmt_VOP_SDWA.SRC0_SEXT, + extData.iFmt_VOP_SDWA.SRC0_NEG, + extData.iFmt_VOP_SDWA.SRC0_ABS, + extData.iFmt_VOP_SDWA.SRC1_SEL, + extData.iFmt_VOP_SDWA.SRC1_SEXT, + extData.iFmt_VOP_SDWA.SRC1_NEG, + extData.iFmt_VOP_SDWA.SRC1_ABS); + + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] + src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + } + } + + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_U32 class methods --- + + Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_SUB_U32 + + Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() + { + } // ~Inst_VOP2__V_SUB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + void + Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_U32 class methods --- + + Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_SUBREV_U32 + + Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32() + { + } // ~Inst_VOP2__V_SUBREV_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u; + void + 
Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src1[lane] - src0[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP2__V_FMAC_F32 class methods ---
+
+    Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_fmac_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP2__V_FMAC_F32
+
+    Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32()
+    {
+    } // ~Inst_VOP2__V_FMAC_F32
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f + D.f.
+    void
+    Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+        vdst.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+} // namespace VegaISA
+} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
new file mode 100644
index 0000000000..8f6794c9c2
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -0,0 +1,8906 @@
+/*
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3__V_CNDMASK_B32 class methods --- + + Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cndmask_b32", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + } // Inst_VOP3__V_CNDMASK_B32 + + Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() + { + } // ~Inst_VOP3__V_CNDMASK_B32 + + // --- description from .arch file --- + // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC + // as a scalar GPR in S2. + void + Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(vcc.rawData(), lane) + ? src1[lane] : src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_F32 class methods --- + + Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_ADD_F32 + + Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() + { + } // ~Inst_VOP3__V_ADD_F32 + + // --- description from .arch file --- + // D.f = S0.f + S1.f. + void + Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_F32 class methods --- + + Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SUB_F32 + + Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() + { + } // ~Inst_VOP3__V_SUB_F32 + + // --- description from .arch file --- + // D.f = S0.f - S1.f. + // SQ translates to V_ADD_F32. 
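+    // Note: in these VOP3 FP implementations, instData.ABS bit 0 and
+    // extData.NEG bit 0 apply the absolute-value and negation input
+    // modifiers to SRC0, bit 1 applies them to SRC1, and bit 2 (SRC2)
+    // must stay clear for two-operand instructions, which the asserts
+    // below enforce.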
+ void + Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_F32 class methods --- + + Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SUBREV_F32 + + Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() + { + } // ~Inst_VOP3__V_SUBREV_F32 + + // --- description from .arch file --- + // D.f = S1.f - S0.f. + // SQ translates to V_ADD_F32. + void + Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods --- + + Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MUL_LEGACY_F32 + + Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32() + { + } // ~Inst_VOP3__V_MUL_LEGACY_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
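+    // Note: the lane loop below spells out the DX9 corner cases: NaN
+    // inputs propagate NaN, zero (or subnormal) times infinity yields NaN,
+    // a zero/subnormal first source times a finite value yields a signed
+    // zero, and an infinite first source times a finite non-zero value
+    // yields a signed infinity; every remaining case is an ordinary
+    // multiply.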
+ void + Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane])) { + vdst[lane] = NAN; + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + !std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if (std::isinf(src0[lane]) && + !std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else if (std::isinf(src0[lane]) && + std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else { + vdst[lane] = src0[lane] * src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_F32 class methods --- + + Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MUL_F32 + + Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32() + { + } // ~Inst_VOP3__V_MUL_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f. 
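+    // Note: v_mul_f32 applies the same explicit NaN, zero/subnormal, and
+    // infinity special-casing as v_mul_legacy_f32 above; only the
+    // well-defined finite cases fall through to the plain multiply.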
+ void + Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane])) { + vdst[lane] = NAN; + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + !std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if (std::isinf(src0[lane]) && + !std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else if (std::isinf(src0[lane]) && + std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else { + vdst[lane] = src0[lane] * src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_I32_I24 class methods --- + + Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_i32_i24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_I32_I24 + + Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24() + { + } // ~Inst_VOP3__V_MUL_I32_I24 + + // --- description from .arch file --- + // D.i = S0.i[23:0] * S1.i[23:0]. 
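+    // Note: only the low 24 bits of each source participate;
+    // bits(..., 23, 0) extracts them and sext<24>() sign-extends before
+    // the 32-bit product is formed.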
+ void + Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) + * sext<24>(bits(src1[lane], 23, 0)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods --- + + Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_I32_I24 + + Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() + { + } // ~Inst_VOP3__V_MUL_HI_I32_I24 + + // --- description from .arch file --- + // D.i = (S0.i[23:0] * S1.i[23:0])>>32. + void + Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 tmp_src0 + = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); + VecElemI64 tmp_src1 + = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); + + vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_U32_U24 class methods --- + + Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_u32_u24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_U32_U24 + + Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() + { + } // ~Inst_VOP3__V_MUL_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0]. 
+ void + Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods --- + + Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_U32_U24 + + Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() + { + } // ~Inst_VOP3__V_MUL_HI_U32_U24 + + // --- description from .arch file --- + // D.i = (S0.u[23:0] * S1.u[23:0])>>32. + void + Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); + VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); + vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_F32 class methods --- + + Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MIN_F32 + + Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() + { + } // ~Inst_VOP3__V_MIN_F32 + + // --- description from .arch file --- + // D.f = (S0.f < S1.f ? S0.f : S1.f). 
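+    // Note: the FP min/max below use std::fmin/std::fmax, which return
+    // the non-NaN operand when exactly one input is NaN, rather than the
+    // plain comparison given in the .arch description.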
+ void + Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_F32 class methods --- + + Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MAX_F32 + + Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() + { + } // ~Inst_VOP3__V_MAX_F32 + + // --- description from .arch file --- + // D.f = (S0.f >= S1.f ? S0.f : S1.f). + void + Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_I32 class methods --- + + Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_I32 + + Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() + { + } // ~Inst_VOP3__V_MIN_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i). + void + Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_I32 class methods --- + + Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_I32 + + Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() + { + } // ~Inst_VOP3__V_MAX_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i). 
+ void + Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_U32 class methods --- + + Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_U32 + + Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() + { + } // ~Inst_VOP3__V_MIN_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u). + void + Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_U32 class methods --- + + Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_U32 + + Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() + { + } // ~Inst_VOP3__V_MAX_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u). + void + Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHRREV_B32 class methods --- + + Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshrrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHRREV_B32 + + Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() + { + } // ~Inst_VOP3__V_LSHRREV_B32 + + // --- description from .arch file --- + // D.u = S1.u >> S0.u[4:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. 
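+    // Note: as with the other 32-bit shifts, only the low five bits of S0
+    // (bits(src0, 4, 0)) select the shift amount applied to S1.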
+ void + Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ASHRREV_I32 class methods --- + + Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ashrrev_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ASHRREV_I32 + + Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() + { + } // ~Inst_VOP3__V_ASHRREV_I32 + + // --- description from .arch file --- + // D.i = signext(S1.i) >> S0.i[4:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. + void + Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHLREV_B32 class methods --- + + Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshlrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHLREV_B32 + + Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() + { + } // ~Inst_VOP3__V_LSHLREV_B32 + + // --- description from .arch file --- + // D.u = S1.u << S0.u[4:0]. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_AND_B32 class methods --- + + Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_and_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_AND_B32 + + Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() + { + } // ~Inst_VOP3__V_AND_B32 + + // --- description from .arch file --- + // D.u = S0.u & S1.u. + // Input and output modifiers not supported. + void + Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] & src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_OR_B32 class methods --- + + Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_OR_B32 + + Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() + { + } // ~Inst_VOP3__V_OR_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u. + // Input and output modifiers not supported. + void + Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_OR3_B32 class methods --- + + Inst_VOP3__V_OR3_B32::Inst_VOP3__V_OR3_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_or3_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_OR3_B32 + + Inst_VOP3__V_OR3_B32::~Inst_VOP3__V_OR3_B32() + { + } // ~Inst_VOP3__V_OR3_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u | S2.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_OR3_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane] | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_XOR_B32 class methods --- + + Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_xor_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_XOR_B32 + + Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() + { + } // ~Inst_VOP3__V_XOR_B32 + + // --- description from .arch file --- + // D.u = S0.u ^ S1.u. + // Input and output modifiers not supported. + void + Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] ^ src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAC_F32 class methods --- + + Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mac_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(MAC); + } // Inst_VOP3__V_MAC_F32 + + Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32() + { + } // ~Inst_VOP3__V_MAC_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + D.f. + // SQ translates to V_MAD_F32. 
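+    // Note: the destination is also a source for MAC, so vdst is read
+    // before the lane loop and each lane performs a fused multiply-add
+    // via std::fma(src0, src1, vdst).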
+ void + Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vdst.read(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_CO_U32 class methods --- + + Inst_VOP3__V_ADD_CO_U32::Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_add_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP3__V_ADD_CO_U32 + + Inst_VOP3__V_ADD_CO_U32::~Inst_VOP3__V_ADD_CO_U32() + { + } // ~Inst_VOP3__V_ADD_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED + // --- overflow or carry-out for V_ADDC_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + void + Inst_VOP3__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + vcc.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP3__V_SUB_CO_U32 class methods --- + + Inst_VOP3__V_SUB_CO_U32::Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_sub_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP3__V_SUB_CO_U32 + + Inst_VOP3__V_SUB_CO_U32::~Inst_VOP3__V_SUB_CO_U32() + { + } // ~Inst_VOP3__V_SUB_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or + // carry-out for V_SUBB_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + void + Inst_VOP3__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + vcc.setBit(lane, src1[lane] > src0[lane] ? 
1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_CO_U32 class methods --- + + Inst_VOP3__V_SUBREV_CO_U32::Inst_VOP3__V_SUBREV_CO_U32( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_subrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP3__V_SUBREV_CO_U32 + + Inst_VOP3__V_SUBREV_CO_U32::~Inst_VOP3__V_SUBREV_CO_U32() + { + } // ~Inst_VOP3__V_SUBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u; + // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or + // carry-out for V_SUBB_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + // SQ translates this to V_SUB_U32 with reversed operands. + void + Inst_VOP3__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP3__V_ADDC_CO_U32 class methods --- + + Inst_VOP3__V_ADDC_CO_U32::Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_addc_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP3__V_ADDC_CO_U32 + + Inst_VOP3__V_ADDC_CO_U32::~Inst_VOP3__V_ADDC_CO_U32() + { + } // ~Inst_VOP3__V_ADDC_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + VCC[threadId]; + // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) + // is an UNSIGNED overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. + void + Inst_VOP3__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane] + + bits(vcc.rawData(), lane); + sdst.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane] + + (VecElemU64)bits(vcc.rawData(), lane)) + >= 0x100000000 ? 
1 : 0); + } + } + + vdst.write(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_SUBB_CO_U32 class methods --- + + Inst_VOP3__V_SUBB_CO_U32::Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_subb_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP3__V_SUBB_CO_U32 + + Inst_VOP3__V_SUBB_CO_U32::~Inst_VOP3__V_SUBB_CO_U32() + { + } // ~Inst_VOP3__V_SUBB_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // --- overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // --- source comes from the SGPR-pair at S2.u. + void + Inst_VOP3__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane] + - bits(vcc.rawData(), lane); + sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods --- + + Inst_VOP3__V_SUBBREV_CO_U32::Inst_VOP3__V_SUBBREV_CO_U32( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_subbrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP3__V_SUBBREV_CO_U32 + + Inst_VOP3__V_SUBBREV_CO_U32::~Inst_VOP3__V_SUBBREV_CO_U32() + { + } // ~Inst_VOP3__V_SUBBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. + void + Inst_VOP3__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane] + - bits(vcc.rawData(), lane); + sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 
1 : 0); + } + } + + vdst.write(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_F16 class methods --- + + Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_ADD_F16 + + Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() + { + } // ~Inst_VOP3__V_ADD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SUB_F16 class methods --- + + Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SUB_F16 + + Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() + { + } // ~Inst_VOP3__V_SUB_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 - S1.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. + void + Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SUBREV_F16 class methods --- + + Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SUBREV_F16 + + Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16() + { + } // ~Inst_VOP3__V_SUBREV_F16 + + // --- description from .arch file --- + // D.f16 = S1.f16 - S0.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. + void + Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MUL_F16 class methods --- + + Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_MUL_F16 + + Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16() + { + } // ~Inst_VOP3__V_MUL_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MAC_F16 class methods --- + + Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mac_f16", false) + { + setFlag(ALU); + setFlag(F16); + setFlag(MAC); + } // Inst_VOP3__V_MAC_F16 + + Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16() + { + } // ~Inst_VOP3__V_MAC_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + D.f16. + // Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. + void + Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_ADD_U16 class methods --- + + Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD_U16 + + Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16() + { + } // ~Inst_VOP3__V_ADD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 + S1.u16. + // Supports saturation (unsigned 16-bit integer domain). 
+ void + Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_U16 class methods --- + + Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUB_U16 + + Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16() + { + } // ~Inst_VOP3__V_SUB_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 - S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_U16 class methods --- + + Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUBREV_U16 + + Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16() + { + } // ~Inst_VOP3__V_SUBREV_U16 + + // --- description from .arch file --- + // D.u16 = S1.u16 - S0.u16. + // Supports saturation (unsigned 16-bit integer domain). + // SQ translates this to V_SUB_U16 with reversed operands. + void + Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_LO_U16 class methods --- + + Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_lo_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_LO_U16 + + Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16() + { + } // ~Inst_VOP3__V_MUL_LO_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16. 
+ // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHLREV_B16 class methods --- + + Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshlrev_b16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHLREV_B16 + + Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16() + { + } // ~Inst_VOP3__V_LSHLREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. + // SQ translates this to an internal SP opcode. + void + Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHRREV_B16 class methods --- + + Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshrrev_b16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHRREV_B16 + + Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16() + { + } // ~Inst_VOP3__V_LSHRREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. 
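+    // Note: the 16-bit shifts use only the low four bits of S0
+    // (bits(src0, 3, 0)) as the shift amount.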
+ void + Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ASHRREV_I16 class methods --- + + Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ashrrev_i16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ASHRREV_I16 + + Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16() + { + } // ~Inst_VOP3__V_ASHRREV_I16 + + // --- description from .arch file --- + // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. + void + Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_F16 class methods --- + + Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_MAX_F16 + + Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16() + { + } // ~Inst_VOP3__V_MAX_F16 + + // --- description from .arch file --- + // D.f16 = max(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MIN_F16 class methods --- + + Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_MIN_F16 + + Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16() + { + } // ~Inst_VOP3__V_MIN_F16 + + // --- description from .arch file --- + // D.f16 = min(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MAX_U16 class methods --- + + Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_U16 + + Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16() + { + } // ~Inst_VOP3__V_MAX_U16 + + // --- description from .arch file --- + // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). 
+ void + Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_I16 class methods --- + + Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_i16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_I16 + + Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16() + { + } // ~Inst_VOP3__V_MAX_I16 + + // --- description from .arch file --- + // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). + void + Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_U16 class methods --- + + Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_U16 + + Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16() + { + } // ~Inst_VOP3__V_MIN_U16 + + // --- description from .arch file --- + // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). + void + Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_I16 class methods --- + + Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_i16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_I16 + + Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16() + { + } // ~Inst_VOP3__V_MIN_I16 + + // --- description from .arch file --- + // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). 
+ void + Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LDEXP_F16 class methods --- + + Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ldexp_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_LDEXP_F16 + + Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16() + { + } // ~Inst_VOP3__V_LDEXP_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * (2 ** S1.i16). + void + Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_ADD_U32 class methods --- + + Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD_U32 + + Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32() + { + } // ~Inst_VOP3__V_ADD_U32 + + // --- description from .arch file --- + // D.u32 = S0.u32 + S1.u32. + void + Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_U32 class methods --- + + Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUB_U32 + + Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32() + { + } // ~Inst_VOP3__V_SUB_U32 + + // --- description from .arch file --- + // D.u32 = S0.u32 - S1.u32. 
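+ // Implementation note: plain per-lane unsigned subtraction; the result
+ // wraps modulo 2^32 and no borrow is recorded here.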
+ void + Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_U32 class methods --- + + Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUBREV_U32 + + Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32() + { + } // ~Inst_VOP3__V_SUBREV_U32 + + // --- description from .arch file --- + // D.u32 = S1.u32 - S0.u32. + void + Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_NOP class methods --- + + Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_nop", false) + { + setFlag(Nop); + setFlag(ALU); + } // Inst_VOP3__V_NOP + + Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP() + { + } // ~Inst_VOP3__V_NOP + + // --- description from .arch file --- + // Do nothing. + void + Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_VOP3__V_MOV_B32 class methods --- + + Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mov_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MOV_B32 + + Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32() + { + } // ~Inst_VOP3__V_MOV_B32 + + // --- description from .arch file --- + // D.u = S0.u. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_I32_F64 class methods --- + + Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_I32_F64 + + Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64() + { + } // ~Inst_VOP3__V_CVT_I32_F64 + + // --- description from .arch file --- + // D.i = (int)S0.d. + // Out-of-range floating point values (including infinity) saturate. 
NaN is + // --- converted to 0. + void + Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_I32 class methods --- + + Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_I32 + + Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32() + { + } // ~Inst_VOP3__V_CVT_F64_I32 + + // --- description from .arch file --- + // D.d = (double)S0.i. + void + Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_I32 class methods --- + + Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_I32 + + Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32() + { + } // ~Inst_VOP3__V_CVT_F32_I32 + + // --- description from .arch file --- + // D.f = (float)S0.i. + void + Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + VecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_U32 class methods --- + + Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_U32 + + Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() + { + } // ~Inst_VOP3__V_CVT_F32_U32 + + // --- description from .arch file --- + // D.f = (float)S0.u. 
+ void + Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_U32_F32 class methods --- + + Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_U32_F32 + + Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() + { + } // ~Inst_VOP3__V_CVT_U32_F32 + + // --- description from .arch file --- + // D.u = (unsigned)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. + void + Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_I32_F32 class methods --- + + Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_I32_F32 + + Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32() + { + } // ~Inst_VOP3__V_CVT_I32_F32 + + // --- description from .arch file --- + // D.i = (int)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
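+ // Implementation note: NaN inputs produce 0. std::frexp supplies the
+ // binary exponent, and inputs with an exponent above 30 (or +/-infinity)
+ // clamp to INT_MAX or INT_MIN depending on sign.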
+ void + Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MOV_FED_B32 class methods --- + + Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mov_fed_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MOV_FED_B32 + + Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32() + { + } // ~Inst_VOP3__V_MOV_FED_B32 + + // --- description from .arch file --- + // D.u = S0.u; + // Introduce EDC double error upon write to dest vgpr without causing an + // --- exception. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_F32 class methods --- + + Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F16_F32 + + Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32() + { + } // ~Inst_VOP3__V_CVT_F16_F32 + + // --- description from .arch file --- + // D.f16 = flt32_to_flt16(S0.f). + // Supports input modifiers and creates FP16 denormals when appropriate. + void + Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F32_F16 class methods --- + + Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_F16 + + Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16() + { + } // ~Inst_VOP3__V_CVT_F32_F16 + + // --- description from .arch file --- + // D.f = flt16_to_flt32(S0.f16). + // FP16 denormal inputs are always accepted. + void + Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods --- + + Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_RPI_I32_F32 + + Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32() + { + } // ~Inst_VOP3__V_CVT_RPI_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f + 0.5). 
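+ // Implementation note: computed directly as floor(S0.f + 0.5) per lane,
+ // i.e. round-half-up, with no additional range or NaN handling.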
+ void + Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods --- + + Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_FLR_I32_F32 + + Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32() + { + } // ~Inst_VOP3__V_CVT_FLR_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f). + void + Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods --- + + Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_OFF_F32_I4 + + Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4() + { + } // ~Inst_VOP3__V_CVT_OFF_F32_I4 + + // --- description from .arch file --- + // 4-bit signed int to 32-bit float. Used for interpolation in shader. + void + Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) + { + // Could not parse sq_uc.arch desc field + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F32_F64 class methods --- + + Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F32_F64 + + Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64() + { + } // ~Inst_VOP3__V_CVT_F32_F64 + + // --- description from .arch file --- + // D.f = (float)S0.d. 
+ void + Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_F32 class methods --- + + Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_F32 + + Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32() + { + } // ~Inst_VOP3__V_CVT_F64_F32 + + // --- description from .arch file --- + // D.d = (double)S0.f. + void + Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE0 + + Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE0 + + // --- description from .arch file --- + // D.f = (float)(S0.u[7:0]). + void + Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 7, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE1 + + Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE1 + + // --- description from .arch file --- + // D.f = (float)(S0.u[15:8]). 
+ void + Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 15, 8); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE2 + + Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE2 + + // --- description from .arch file --- + // D.f = (float)(S0.u[23:16]). + void + Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 23, 16); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE3 + + Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE3 + + // --- description from .arch file --- + // D.f = (float)(S0.u[31:24]). + void + Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 31, 24); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_U32_F64 class methods --- + + Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_U32_F64 + + Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64() + { + } // ~Inst_VOP3__V_CVT_U32_F64 + + // --- description from .arch file --- + // D.u = (unsigned)S0.d. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
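+ // Implementation note: NaN converts to 0, -infinity clamps to 0, and
+ // +infinity or any input whose frexp exponent exceeds 31 clamps to
+ // UINT_MAX; everything else is a plain per-lane cast.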
+ void + Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_U32 class methods --- + + Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_U32 + + Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32() + { + } // ~Inst_VOP3__V_CVT_F64_U32 + + // --- description from .arch file --- + // D.d = (double)S0.u. + void + Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_TRUNC_F64 class methods --- + + Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_TRUNC_F64 + + Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64() + { + } // ~Inst_VOP3__V_TRUNC_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d), return integer part of S0.d. + void + Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CEIL_F64 class methods --- + + Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CEIL_F64 + + Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64() + { + } // ~Inst_VOP3__V_CEIL_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. 
+ void + Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RNDNE_F64 class methods --- + + Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_RNDNE_F64 + + Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64() + { + } // ~Inst_VOP3__V_RNDNE_F64 + + // --- description from .arch file --- + // D.d = round_nearest_even(S0.d). + void + Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FLOOR_F64 class methods --- + + Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FLOOR_F64 + + Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64() + { + } // ~Inst_VOP3__V_FLOOR_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. + void + Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FRACT_F32 class methods --- + + Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FRACT_F32 + + Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() + { + } // ~Inst_VOP3__V_FRACT_F32 + + // --- description from .arch file --- + // D.f = S0.f - floor(S0.f). 
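+ // Implementation note: std::modf splits each lane into fractional and
+ // integral parts; only the fractional part is kept, the integral part
+ // is discarded.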
+ void + Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_TRUNC_F32 class methods --- + + Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_TRUNC_F32 + + Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() + { + } // ~Inst_VOP3__V_TRUNC_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f), return integer part of S0.f. + void + Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CEIL_F32 class methods --- + + Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CEIL_F32 + + Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32() + { + } // ~Inst_VOP3__V_CEIL_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. + void + Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RNDNE_F32 class methods --- + + Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RNDNE_F32 + + Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32() + { + } // ~Inst_VOP3__V_RNDNE_F32 + + // --- description from .arch file --- + // D.f = round_nearest_even(S0.f). 
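+ // Implementation note: delegates to the roundNearestEven() helper for
+ // round-to-nearest-even behavior.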
+ void + Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FLOOR_F32 class methods --- + + Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FLOOR_F32 + + Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32() + { + } // ~Inst_VOP3__V_FLOOR_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. + void + Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_EXP_F32 class methods --- + + Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_exp_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_EXP_F32 + + Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32() + { + } // ~Inst_VOP3__V_EXP_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f). + void + Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::pow(2.0, src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LOG_F32 class methods --- + + Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_log_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_LOG_F32 + + Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32() + { + } // ~Inst_VOP3__V_LOG_F32 + + // --- description from .arch file --- + // D.f = log2(S0.f). Base 2 logarithm. 
+ void + Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::log2(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RCP_F32 class methods --- + + Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RCP_F32 + + Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32() + { + } // ~Inst_VOP3__V_RCP_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error. + void + Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods --- + + Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RCP_IFLAG_F32 + + Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32() + { + } // ~Inst_VOP3__V_RCP_IFLAG_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise + // --- integer DIV_BY_ZERO exception but cannot raise floating-point + // --- exceptions. + void + Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RSQ_F32 class methods --- + + Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rsq_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RSQ_F32 + + Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32() + { + } // ~Inst_VOP3__V_RSQ_F32 + + // --- description from .arch file --- + // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules. 
+ void + Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RCP_F64 class methods --- + + Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_RCP_F64 + + Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64() + { + } // ~Inst_VOP3__V_RCP_F64 + + // --- description from .arch file --- + // D.d = 1.0 / S0.d. + void + Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = 0.0; + } + } else { + vdst[lane] = 1.0 / src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RSQ_F64 class methods --- + + Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rsq_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_RSQ_F64 + + Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64() + { + } // ~Inst_VOP3__V_RSQ_F64 + + // --- description from .arch file --- + // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32. + void + Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) { + vdst[lane] = 0.0; + } else if (std::signbit(src[lane])) { + vdst[lane] = NAN; + } else { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SQRT_F32 class methods --- + + Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SQRT_F32 + + Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32() + { + } // ~Inst_VOP3__V_SQRT_F32 + + // --- description from .arch file --- + // D.f = sqrt(S0.f). 
+ void + Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SQRT_F64 class methods --- + + Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_SQRT_F64 + + Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64() + { + } // ~Inst_VOP3__V_SQRT_F64 + + // --- description from .arch file --- + // D.d = sqrt(S0.d). + void + Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SIN_F32 class methods --- + + Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sin_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SIN_F32 + + Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32() + { + } // ~Inst_VOP3__V_SIN_F32 + + // --- description from .arch file --- + // D.f = sin(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 0.0. + void + Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + ConstScalarOperandF32 pi(gpuDynInst, REG_PI); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + pi.read(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sin(src[lane] * 2 * pi.rawData()); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_COS_F32 class methods --- + + Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cos_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_COS_F32 + + Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32() + { + } // ~Inst_VOP3__V_COS_F32 + + // --- description from .arch file --- + // D.f = cos(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 1.0. 
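+ // Implementation note: as with V_SIN_F32 above, PI is read from the
+ // REG_PI scalar source and the result is std::cos(S0 * 2 * PI); inputs
+ // outside [-256.0, +256.0] are not given any special handling here.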
+ void + Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + ConstScalarOperandF32 pi(gpuDynInst, REG_PI); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + pi.read(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::cos(src[lane] * 2 * pi.rawData()); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_NOT_B32 class methods --- + + Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_not_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_NOT_B32 + + Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32() + { + } // ~Inst_VOP3__V_NOT_B32 + + // --- description from .arch file --- + // D.u = ~S0.u. + // Input and output modifiers not supported. + void + Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ~src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_BFREV_B32 class methods --- + + Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFREV_B32 + + Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32() + { + } // ~Inst_VOP3__V_BFREV_B32 + + // --- description from .arch file --- + // D.u[31:0] = S0.u[0:31], bitfield reverse. + // Input and output modifiers not supported. + void + Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = reverseBits(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBH_U32 class methods --- + + Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbh_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBH_U32 + + Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32() + { + } // ~Inst_VOP3__V_FFBH_U32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from MSB; + // D.u = 0xffffffff if S0.u == 0. 
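+ // Implementation note: the search is delegated to the findFirstOneMsb()
+ // helper, which scans from the most significant bit downward.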
+ void + Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOneMsb(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBL_B32 class methods --- + + Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbl_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBL_B32 + + Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32() + { + } // ~Inst_VOP3__V_FFBL_B32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from LSB; + // D.u = 0xffffffff if S0.u == 0. + void + Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOne(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBH_I32 class methods --- + + Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbh_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBH_I32 + + Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32() + { + } // ~Inst_VOP3__V_FFBH_I32 + + // --- description from .arch file --- + // D.u = position of first bit different from sign bit in S0.i from MSB; + // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. + void + Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = firstOppositeSignBit(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods --- + + Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FREXP_EXP_I32_F64 + + Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64() + { + } // ~Inst_VOP3__V_FREXP_EXP_I32_F64 + + // --- description from .arch file --- + // See V_FREXP_EXP_I32_F32. 
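+ // Implementation note: infinity and NaN inputs produce 0; otherwise
+ // std::frexp supplies an exponent such that S0.d = significand * 2**exp
+ // with |significand| in [0.5, 1.0).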
+ void + Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F64 class methods --- + + Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FREXP_MANT_F64 + + Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64() + { + } // ~Inst_VOP3__V_FREXP_MANT_F64 + + // --- description from .arch file --- + // See V_FREXP_MANT_F32. + void + Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FRACT_F64 class methods --- + + Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FRACT_F64 + + Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() + { + } // ~Inst_VOP3__V_FRACT_F64 + + // --- description from .arch file --- + // See V_FRACT_F32. + void + Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods --- + + Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FREXP_EXP_I32_F32 + + Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() + { + } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.i = 0; + // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). + // Returns exponent of single precision float input, such that S0.f = + // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns + // the significand. 
+ void + Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane])|| std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F32 class methods --- + + Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FREXP_MANT_F32 + + Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32() + { + } // ~Inst_VOP3__V_FREXP_MANT_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; + // else D.f = Mantissa(S0.f). + // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary + // --- significand of single precision float input, such that S0.f = + // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which + // --- returns integer exponent. + void + Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = src[lane]; + } else { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CLREXCP class methods --- + + Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_clrexcp", false) + { + } // Inst_VOP3__V_CLREXCP + + Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP() + { + } // ~Inst_VOP3__V_CLREXCP + + // --- description from .arch file --- + // Clear wave's exception state in SIMD (SP). + void + Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_U16 class methods --- + + Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_F16_U16 + + Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16() + { + } // ~Inst_VOP3__V_CVT_F16_U16 + + // --- description from .arch file --- + // D.f16 = uint16_to_flt16(S.u16). + // Supports denormals, rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_I16 class methods --- + + Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_F16_I16 + + Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16() + { + } // ~Inst_VOP3__V_CVT_F16_I16 + + // --- description from .arch file --- + // D.f16 = int16_to_flt16(S.i16). 
+ // Supports denormals, rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_U16_F16 class methods --- + + Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_U16_F16 + + Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16() + { + } // ~Inst_VOP3__V_CVT_U16_F16 + + // --- description from .arch file --- + // D.u16 = flt16_to_uint16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_I16_F16 class methods --- + + Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_I16_F16 + + Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16() + { + } // ~Inst_VOP3__V_CVT_I16_F16 + + // --- description from .arch file --- + // D.i16 = flt16_to_int16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RCP_F16 class methods --- + + Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RCP_F16 + + Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16() + { + } // ~Inst_VOP3__V_RCP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecip(S0.f16). + void + Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SQRT_F16 class methods --- + + Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SQRT_F16 + + Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16() + { + } // ~Inst_VOP3__V_SQRT_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateSqrt(S0.f16). + void + Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RSQ_F16 class methods --- + + Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rsq_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RSQ_F16 + + Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16() + { + } // ~Inst_VOP3__V_RSQ_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecipSqrt(S0.f16). + void + Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_LOG_F16 class methods --- + + Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_log_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_LOG_F16 + + Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16() + { + } // ~Inst_VOP3__V_LOG_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 0.0f; + // else + // D.f16 = ApproximateLog2(S0.f16). 
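+    // A minimal sketch of the rule above, assuming the F16 source were first
+    // widened to a host float (illustrative only; log2Sketch is a
+    // hypothetical helper, and F16 rounding/denormal behavior is not
+    // modeled):
+    //
+    //     float log2Sketch(float s0)
+    //     {
+    //         return (s0 == 1.0f) ? 0.0f : std::log2(s0);
+    //     }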
+ void + Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_EXP_F16 class methods --- + + Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_exp_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_EXP_F16 + + Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16() + { + } // ~Inst_VOP3__V_EXP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 0.0f) + // D.f16 = 1.0f; + // else + // D.f16 = Approximate2ToX(S0.f16). + void + Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F16 class methods --- + + Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FREXP_MANT_F16 + + Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16() + { + } // ~Inst_VOP3__V_FREXP_MANT_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.f16 = S0.f16; + // else + // D.f16 = mantissa(S0.f16). + // Result range is (-1.0,-0.5][0.5,1.0). + // C math library frexp function. + // Returns binary significand of half precision float input, such that the + // original single float = significand * (2 ** exponent). + void + Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods --- + + Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FREXP_EXP_I16_F16 + + Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16() + { + } // ~Inst_VOP3__V_FREXP_EXP_I16_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.i16 = 0; + // else + // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). + // C math library frexp function. + // Returns exponent of half precision float input, such that the + // original single float = significand * (2 ** exponent). + void + Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FLOOR_F16 class methods --- + + Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FLOOR_F16 + + Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16() + { + } // ~Inst_VOP3__V_FLOOR_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. + void + Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CEIL_F16 class methods --- + + Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CEIL_F16 + + Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16() + { + } // ~Inst_VOP3__V_CEIL_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
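+    // A minimal sketch of the trunc-and-adjust rule above on a widened host
+    // float (illustrative only; ceilSketch is a hypothetical helper and, for
+    // finite inputs, matches std::ceil):
+    //
+    //     float ceilSketch(float s0)
+    //     {
+    //         float d = std::trunc(s0);
+    //         if (s0 > 0.0f && s0 != d) {
+    //             d += 1.0f;
+    //         }
+    //         return d;
+    //     }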
+ void + Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_TRUNC_F16 class methods --- + + Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_TRUNC_F16 + + Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16() + { + } // ~Inst_VOP3__V_TRUNC_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16). + // Round-to-zero semantics. + void + Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RNDNE_F16 class methods --- + + Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RNDNE_F16 + + Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16() + { + } // ~Inst_VOP3__V_RNDNE_F16 + + // --- description from .arch file --- + // D.f16 = FLOOR(S0.f16 + 0.5f); + // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. + // Round-to-nearest-even semantics. + void + Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FRACT_F16 class methods --- + + Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FRACT_F16 + + Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16() + { + } // ~Inst_VOP3__V_FRACT_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + -floor(S0.f16). + void + Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SIN_F16 class methods --- + + Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sin_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SIN_F16 + + Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16() + { + } // ~Inst_VOP3__V_SIN_F16 + + // --- description from .arch file --- + // D.f16 = sin(S0.f16 * 2 * PI). + void + Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_COS_F16 class methods --- + + Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cos_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_COS_F16 + + Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16() + { + } // ~Inst_VOP3__V_COS_F16 + + // --- description from .arch file --- + // D.f16 = cos(S0.f16 * 2 * PI). + void + Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods --- + + Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_EXP_LEGACY_F32 + + Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32() + { + } // ~Inst_VOP3__V_EXP_LEGACY_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f) with legacy semantics. 
+ void + Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::pow(2.0, src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods --- + + Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_log_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_LOG_LEGACY_F32 + + Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32() + { + } // ~Inst_VOP3__V_LOG_LEGACY_F32 + + // --- description from .arch file --- + // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. + void + Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::log2(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods --- + + Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(MAD); + } // Inst_VOP3__V_MAD_LEGACY_F32 + + Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32() + { + } // ~Inst_VOP3__V_MAD_LEGACY_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0). + void + Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_F32 class methods --- + + Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(MAD); + } // Inst_VOP3__V_MAD_F32 + + Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() + { + } // ~Inst_VOP3__V_MAD_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + S2.f. 
+ void + Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_I32_I24 class methods --- + + Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_i32_i24", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_I32_I24 + + Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24() + { + } // ~Inst_VOP3__V_MAD_I32_I24 + + // --- description from .arch file --- + // D.i = S0.i[23:0] * S1.i[23:0] + S2.i. + void + Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) + * sext<24>(bits(src1[lane], 23, 0)) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_U32_U24 class methods --- + + Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_u32_u24", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_U32_U24 + + Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24() + { + } // ~Inst_VOP3__V_MAD_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0] + S2.u. 
+ void + Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0) + + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CUBEID_F32 class methods --- + + Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubeid_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBEID_F32 + + Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32() + { + } // ~Inst_VOP3__V_CUBEID_F32 + + // --- description from .arch file --- + // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in + // --- (S0.f, S1.f, S2.f). + void + Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CUBESC_F32 class methods --- + + Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubesc_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBESC_F32 + + Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32() + { + } // ~Inst_VOP3__V_CUBESC_F32 + + // --- description from .arch file --- + // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f, + // S2.f). + void + Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CUBETC_F32 class methods --- + + Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubetc_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBETC_F32 + + Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32() + { + } // ~Inst_VOP3__V_CUBETC_F32 + + // --- description from .arch file --- + // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f, + // S2.f). + void + Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CUBEMA_F32 class methods --- + + Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubema_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBEMA_F32 + + Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32() + { + } // ~Inst_VOP3__V_CUBEMA_F32 + + // --- description from .arch file --- + // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f, + // --- S2.f). 
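+    // The cubemap major axis is the input coordinate with the largest
+    // magnitude, so one plausible sketch of the rule above is (illustrative
+    // only; cubemaSketch is a hypothetical helper, and hardware face and
+    // tie-breaking selection details are not modeled):
+    //
+    //     float cubemaSketch(float x, float y, float z)
+    //     {
+    //         return 2.0f * std::fmax(std::fabs(x),
+    //                                 std::fmax(std::fabs(y), std::fabs(z)));
+    //     }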
+    void
+    Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_BFE_U32 class methods ---
+
+    Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_bfe_u32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_BFE_U32
+
+    Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32()
+    {
+    } // ~Inst_VOP3__V_BFE_U32
+
+    // --- description from .arch file ---
+    // D.u = (S0.u>>S1.u[4:0]) & ((1<<S2.u[4:0])-1); bitfield extract.
+    void
+    Inst_VOP3__V_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
+                    & ((1 << bits(src2[lane], 4, 0)) - 1);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_BFE_I32 class methods ---
+
+    Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_bfe_i32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_BFE_I32
+
+    Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32()
+    {
+    } // ~Inst_VOP3__V_BFE_I32
+
+    // --- description from .arch file ---
+    // D.i = (S0.i>>S1.u[4:0]) & ((1<<S2.u[4:0])-1); bitfield extract, then
+    // sign-extend the result.
+    void
+    Inst_VOP3__V_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
+                    & ((1 << bits(src2[lane], 4, 0)) - 1);
+
+                // Above extracted a signed int of size src2 bits which needs
+                // to be sign-extended. Check if the MSB of our src2-bit
+                // integer is 1, and sign extend if it is.
+                if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) {
+                    vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0);
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_BFI_B32 class methods ---
+
+    Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_bfi_b32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_BFI_B32
+
+    Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32()
+    {
+    } // ~Inst_VOP3__V_BFI_B32
+
+    // --- description from .arch file ---
+    // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
+ void + Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane] + & src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FMA_F32 class methods --- + + Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fma_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(FMA); + } // Inst_VOP3__V_FMA_F32 + + Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32() + { + } // ~Inst_VOP3__V_FMA_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + S2.f. + void + Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FMA_F64 class methods --- + + Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fma_f64", false) + { + setFlag(ALU); + setFlag(F64); + setFlag(FMA); + } // Inst_VOP3__V_FMA_F64 + + Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64() + { + } // ~Inst_VOP3__V_FMA_F64 + + // --- description from .arch file --- + // D.d = S0.d * S1.d + S2.d. 
+ void + Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LERP_U8 class methods --- + + Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lerp_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LERP_U8 + + Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8() + { + } // ~Inst_VOP3__V_LERP_U8 + + // --- description from .arch file --- + // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24 + // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16; + // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8; + // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1). + // Unsigned 8-bit pixel average on packed unsigned bytes (linear + // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up, + // --- otherwise 0.5 truncates. + void + Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ((bits(src0[lane], 31, 24) + + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1) + << 24; + vdst[lane] += ((bits(src0[lane], 23, 16) + + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1) + << 16; + vdst[lane] += ((bits(src0[lane], 15, 8) + + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1) + << 8; + vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0) + + bits(src2[lane], 0)) >> 1); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ALIGNBIT_B32 class methods --- + + Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_alignbit_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ALIGNBIT_B32 + + Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32() + { + } // ~Inst_VOP3__V_ALIGNBIT_B32 + + // --- description from .arch file --- + // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff. 
+ void + Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) + | (VecElemU64)src1[lane]); + vdst[lane] = (VecElemU32)((src_0_1 + >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods --- + + Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_alignbyte_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ALIGNBYTE_B32 + + Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32() + { + } // ~Inst_VOP3__V_ALIGNBYTE_B32 + + // --- description from .arch file --- + // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff. + void + Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) + | (VecElemU64)src1[lane]); + vdst[lane] = (VecElemU32)((src_0_1 + >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0))) + & 0xffffffff); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN3_F32 class methods --- + + Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min3_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MIN3_F32 + + Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32() + { + } // ~Inst_VOP3__V_MIN3_F32 + + // --- description from .arch file --- + // D.f = min(S0.f, S1.f, S2.f). 
+ void + Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]); + vdst[lane] = std::fmin(min_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN3_I32 class methods --- + + Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min3_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN3_I32 + + Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32() + { + } // ~Inst_VOP3__V_MIN3_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i, S2.i). + void + Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]); + vdst[lane] = std::min(min_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN3_U32 class methods --- + + Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN3_U32 + + Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32() + { + } // ~Inst_VOP3__V_MIN3_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u, S2.u). 
+ void + Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]); + vdst[lane] = std::min(min_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX3_F32 class methods --- + + Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max3_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MAX3_F32 + + Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32() + { + } // ~Inst_VOP3__V_MAX3_F32 + + // --- description from .arch file --- + // D.f = max(S0.f, S1.f, S2.f). + void + Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]); + vdst[lane] = std::fmax(max_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX3_I32 class methods --- + + Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max3_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX3_I32 + + Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() + { + } // ~Inst_VOP3__V_MAX3_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i, S2.i). 
+ void + Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]); + vdst[lane] = std::max(max_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX3_U32 class methods --- + + Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX3_U32 + + Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() + { + } // ~Inst_VOP3__V_MAX3_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u, S2.u). + void + Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]); + vdst[lane] = std::max(max_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MED3_F32 class methods --- + + Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_med3_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MED3_F32 + + Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() + { + } // ~Inst_VOP3__V_MED3_F32 + + // --- description from .arch file --- + // D.f = median(S0.f, S1.f, S2.f). 
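+    // A three-input median can be built from min/max operations; one
+    // standard formulation (illustrative only, not necessarily how the
+    // median() helper used below is implemented) is:
+    //
+    //     float med3Sketch(float a, float b, float c)
+    //     {
+    //         return std::fmax(std::fmin(a, b),
+    //                          std::fmin(std::fmax(a, b), c));
+    //     }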
+ void + Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = median(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MED3_I32 class methods --- + + Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_med3_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MED3_I32 + + Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32() + { + } // ~Inst_VOP3__V_MED3_I32 + + // --- description from .arch file --- + // D.i = median(S0.i, S1.i, S2.i). + void + Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = median(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MED3_U32 class methods --- + + Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_med3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MED3_U32 + + Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32() + { + } // ~Inst_VOP3__V_MED3_U32 + + // --- description from .arch file --- + // D.u = median(S0.u, S1.u, S2.u). 
+ void + Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = median(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_U8 class methods --- + + Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_U8 + + Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() + { + } // ~Inst_VOP3__V_SAD_U8 + + // --- description from .arch file --- + // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) + + // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u. + // Sum of absolute differences with accumulation, overflow into upper bits + // is allowed. + void + Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::abs(bits(src0[lane], 31, 24) + - bits(src1[lane], 31, 24)) + + std::abs(bits(src0[lane], 23, 16) + - bits(src1[lane], 23, 16)) + + std::abs(bits(src0[lane], 15, 8) + - bits(src1[lane], 15, 8)) + + std::abs(bits(src0[lane], 7, 0) + - bits(src1[lane], 7, 0)) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_HI_U8 class methods --- + + Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_hi_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_HI_U8 + + Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8() + { + } // ~Inst_VOP3__V_SAD_HI_U8 + + // --- description from .arch file --- + // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u. + // Sum of absolute differences with accumulation, overflow is lost. 
+ void + Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (((bits(src0[lane], 31, 24) + - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16) + - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8) + - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0) + - bits(src1[lane], 7, 0))) << 16) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_U16 class methods --- + + Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_U16 + + Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() + { + } // ~Inst_VOP3__V_SAD_U16 + + // --- description from .arch file --- + // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) + // + S2.u. + // Word SAD with accumulation. + void + Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::abs(bits(src0[lane], 31, 16) + - bits(src1[lane], 31, 16)) + + std::abs(bits(src0[lane], 15, 0) + - bits(src1[lane], 15, 0)) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_U32 class methods --- + + Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_U32 + + Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32() + { + } // ~Inst_VOP3__V_SAD_U32 + + // --- description from .arch file --- + // D.u = abs(S0.i - S1.i) + S2.u. + // Dword SAD with accumulation. 
+ void + Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane]; + } // if + } // for + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods --- + + Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PK_U8_F32 + + Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32() + { + } // ~Inst_VOP3__V_CVT_PK_U8_F32 + + // --- description from .arch file --- + // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0])) + // | (S2.u & ~(0xff << (8 * S1.u[1:0]))). + // Convert floating point value S0 to 8-bit unsigned integer and pack the + // result into byte S1 of dword S2. + void + Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (((VecElemU8)src0[lane] & 0xff) + << (8 * bits(src1[lane], 1, 0))) + | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0)))); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_FIXUP_F32 class methods --- + + Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fixup_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_DIV_FIXUP_F32 + + Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32() + { + } // ~Inst_VOP3__V_DIV_FIXUP_F32 + + // --- description from .arch file --- + // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator, + // s2.f = Numerator. This opcode generates exceptions resulting from the + // division operation. 
+    void
+    Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (instData.ABS & 0x4) {
+            src2.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        if (extData.NEG & 0x4) {
+            src2.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::fpclassify(src1[lane]) == FP_ZERO) {
+                    if (std::signbit(src1[lane])) {
+                        vdst[lane] = -INFINITY;
+                    } else {
+                        vdst[lane] = +INFINITY;
+                    }
+                } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
+                    vdst[lane] = NAN;
+                } else if (std::isinf(src1[lane])) {
+                    if (std::signbit(src1[lane])) {
+                        vdst[lane] = -INFINITY;
+                    } else {
+                        vdst[lane] = +INFINITY;
+                    }
+                } else {
+                    vdst[lane] = src2[lane] / src1[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---
+
+    Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_DIV_FIXUP_F64
+
+    Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64()
+    {
+    } // ~Inst_VOP3__V_DIV_FIXUP_F64
+
+    // --- description from .arch file ---
+    // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
+    // s2.d = Numerator. This opcode generates exceptions resulting from the
+    // division operation.
+    void
+    Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (instData.ABS & 0x4) {
+            src2.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        if (extData.NEG & 0x4) {
+            src2.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                int sign_out = std::signbit(src1[lane])
+                    ^ std::signbit(src2[lane]);
+                int exp1(0);
+                int exp2(0);
+                std::frexp(src1[lane], &exp1);
+                std::frexp(src2[lane], &exp2);
+
+                if (std::isnan(src1[lane]) || std::isnan(src2[lane])) {
+                    vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN();
+                } else if (std::fpclassify(src1[lane]) == FP_ZERO
+                    && std::fpclassify(src2[lane]) == FP_ZERO) {
+                    vdst[lane]
+                        = std::numeric_limits<VecElemF64>::signaling_NaN();
+                } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) {
+                    vdst[lane]
+                        = std::numeric_limits<VecElemF64>::signaling_NaN();
+                } else if (std::fpclassify(src1[lane]) == FP_ZERO
+                    || std::isinf(src2[lane])) {
+                    vdst[lane] = sign_out ? -INFINITY : +INFINITY;
+                } else if (std::isinf(src1[lane])
+                    || std::fpclassify(src2[lane]) == FP_ZERO) {
+                    vdst[lane] = sign_out ? -0.0 : +0.0;
+                } else if (exp2 - exp1 < -1075) {
+                    vdst[lane] = src0[lane];
+                } else if (exp1 == 2047) {
+                    vdst[lane] = src0[lane];
+                } else {
+                    vdst[lane] = sign_out ? -std::fabs(src0[lane])
+                        : std::fabs(src0[lane]);
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_DIV_SCALE_F32 class methods ---
+
+    Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32(
+        InFmt_VOP3B *iFmt)
+        : Inst_VOP3B(iFmt, "v_div_scale_f32")
+    {
+        setFlag(ALU);
+        setFlag(WritesVCC);
+        setFlag(F32);
+    } // Inst_VOP3__V_DIV_SCALE_F32
+
+    Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32()
+    {
+    } // ~Inst_VOP3__V_DIV_SCALE_F32
+
+    // --- description from .arch file ---
+    // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f =
+    // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a
+    // numerator and denominator, this opcode will appropriately scale inputs
+    // for division to avoid subnormal terms during Newton-Raphson correction
+    // algorithm. This opcode produces a VCC flag for post-scale of quotient.
+    void
+    Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        if (extData.NEG & 0x4) {
+            src2.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src0[lane];
+                vcc.setBit(lane, 0);
+            }
+        }
+
+        vcc.write();
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_DIV_SCALE_F64 class methods ---
+
+    Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64(
+        InFmt_VOP3B *iFmt)
+        : Inst_VOP3B(iFmt, "v_div_scale_f64")
+    {
+        setFlag(ALU);
+        setFlag(WritesVCC);
+        setFlag(F64);
+    } // Inst_VOP3__V_DIV_SCALE_F64
+
+    Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64()
+    {
+    } // ~Inst_VOP3__V_DIV_SCALE_F64
+
+    // --- description from .arch file ---
+    // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d =
+    // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a
+    // numerator and denominator, this opcode will appropriately scale inputs
+    // for division to avoid subnormal terms during Newton-Raphson correction
+    // algorithm. This opcode produces a VCC flag for post-scale of quotient.
+ void + Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp1(0); + int exp2(0); + std::frexp(src1[lane], &exp1); + std::frexp(src2[lane], &exp2); + vcc.setBit(lane, 0); + + if (std::fpclassify(src1[lane]) == FP_ZERO + || std::fpclassify(src2[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (exp2 - exp1 >= 768) { + vcc.setBit(lane, 1); + if (src0[lane] == src1[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { + vdst[lane] = std::ldexp(src0[lane], 128); + } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL + && std::fpclassify(src2[lane] / src1[lane]) + == FP_SUBNORMAL) { + vcc.setBit(lane, 1); + if (src0[lane] == src1[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { + vdst[lane] = std::ldexp(src0[lane], -128); + } else if (std::fpclassify(src2[lane] / src1[lane]) + == FP_SUBNORMAL) { + vcc.setBit(lane, 1); + if (src0[lane] == src2[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (exp2 <= 53) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_FMAS_F32 class methods --- + + Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fmas_f32", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + setFlag(F32); + setFlag(FMA); + } // Inst_VOP3__V_DIV_FMAS_F32 + + Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() + { + } // ~Inst_VOP3__V_DIV_FMAS_F32 + + // --- description from .arch file --- + // D.f = Special case divide FMA with scale and flags(s0.f = Quotient, + // s1.f = Denominator, s2.f = Numerator) + void + Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + //vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- + + Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fmas_f64", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + setFlag(F64); + setFlag(FMA); + } // Inst_VOP3__V_DIV_FMAS_F64 + + 
Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64() + { + } // ~Inst_VOP3__V_DIV_FMAS_F64 + + // --- description from .arch file --- + // D.d = Special case divide FMA with scale and flags(s0.d = Quotient, + // s1.d = Denominator, s2.d = Numerator) + void + Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + vcc.read(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(vcc.rawData(), lane)) { + vdst[lane] = std::pow(2, 64) + * std::fma(src0[lane], src1[lane], src2[lane]); + } else { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MSAD_U8 class methods --- + + Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_msad_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MSAD_U8 + + Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8() + { + } // ~Inst_VOP3__V_MSAD_U8 + + // --- description from .arch file --- + // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u). + void + Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods --- + + Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_QSAD_PK_U16_U8 + + Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8() + { + } // ~Inst_VOP3__V_QSAD_PK_U16_U8 + + // --- description from .arch file --- + // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], + // S1.u[31:0], S2.u[63:0]) + void + Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods --- + + Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MQSAD_PK_U16_U8 + + Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8() + { + } // ~Inst_VOP3__V_MQSAD_PK_U16_U8 + + // --- description from .arch file --- + // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], + // --- S1.u[31:0], S2.u[63:0]) + void + Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MQSAD_U32_U8 class methods --- + + Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MQSAD_U32_U8 + + Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8() + { + } // ~Inst_VOP3__V_MQSAD_U32_U8 + + // --- description from .arch file --- + // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0], + // --- S1.u[31:0], S2.u[127:0]) + void + 
Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MAD_U64_U32 class methods --- + + Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_mad_u64_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(MAD); + } // Inst_VOP3__V_MAD_U64_U32 + + Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32() + { + } // ~Inst_VOP3__V_MAD_U64_U32 + + // --- description from .arch file --- + // {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64. + void + Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + vdst.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], + src2[lane])); + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_I64_I32 class methods --- + + Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_mad_i64_i32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(MAD); + } // Inst_VOP3__V_MAD_I64_I32 + + Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32() + { + } // ~Inst_VOP3__V_MAD_I64_I32 + + // --- description from .arch file --- + // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64. + void + Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI64 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandI64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], + src2[lane])); + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_XAD_U32 class methods --- + + Inst_VOP3__V_XAD_U32::Inst_VOP3__V_XAD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_xad_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_XAD_U32 + + Inst_VOP3__V_XAD_U32::~Inst_VOP3__V_XAD_U32() + { + } // ~Inst_VOP3__V_XAD_U32 + + // --- description from .arch file --- + // D.u32 = (S0.u32 ^ S1.u32) + S2.u32. 
+ void + Inst_VOP3__V_XAD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHL_ADD_U32 class methods --- + + Inst_VOP3__V_LSHL_ADD_U32::Inst_VOP3__V_LSHL_ADD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_add_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_ADD_U32 + + Inst_VOP3__V_LSHL_ADD_U32::~Inst_VOP3__V_LSHL_ADD_U32() + { + } // ~Inst_VOP3__V_LSHL_ADD_U32 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) + S2.u. + void + Inst_VOP3__V_LSHL_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) + + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_LSHL_U32 class methods --- + + Inst_VOP3__V_ADD_LSHL_U32::Inst_VOP3__V_ADD_LSHL_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_lshl_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD_LSHL_U32 + + Inst_VOP3__V_ADD_LSHL_U32::~Inst_VOP3__V_ADD_LSHL_U32() + { + } // ~Inst_VOP3__V_ADD_LSHL_U32 + + // --- description from .arch file --- + // D.u = (S0.u + S1.u) << S2.u[4:0]. 
+ void + Inst_VOP3__V_ADD_LSHL_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = + (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD3_U32 class methods --- + + Inst_VOP3__V_ADD3_U32::Inst_VOP3__V_ADD3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD3_U32 + + Inst_VOP3__V_ADD3_U32::~Inst_VOP3__V_ADD3_U32() + { + } // ~Inst_VOP3__V_ADD3_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + S2.u. + void + Inst_VOP3__V_ADD3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHL_OR_B32 class methods --- + + Inst_VOP3__V_LSHL_OR_B32::Inst_VOP3__V_LSHL_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_OR_B32 + + Inst_VOP3__V_LSHL_OR_B32::~Inst_VOP3__V_LSHL_OR_B32() + { + } // ~Inst_VOP3__V_LSHL_OR_B32 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) | S2.u. 
+ void + Inst_VOP3__V_LSHL_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) + | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_AND_OR_B32 class methods --- + + Inst_VOP3__V_AND_OR_B32::Inst_VOP3__V_AND_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_and_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_AND_OR_B32 + + Inst_VOP3__V_AND_OR_B32::~Inst_VOP3__V_AND_OR_B32() + { + } // ~Inst_VOP3__V_AND_OR_B32 + + // --- description from .arch file --- + // D.u = (S0.u & S1.u) | S2.u. + // Input and output modifiers not supported. + void + Inst_VOP3__V_AND_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] & src1[lane]) | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_F16 class methods --- + + Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_f16", false) + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP3__V_MAD_F16 + + Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16() + { + } // ~Inst_VOP3__V_MAD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + S2.f16. + // Supports round mode, exception flags, saturation. + void + Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MAD_U16 class methods --- + + Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_u16", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_U16 + + Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16() + { + } // ~Inst_VOP3__V_MAD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16 + S2.u16. + // Supports saturation (unsigned 16-bit integer domain). 
+ void + Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_I16 class methods --- + + Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_i16", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_I16 + + Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() + { + } // ~Inst_VOP3__V_MAD_I16 + + // --- description from .arch file --- + // D.i16 = S0.i16 * S1.i16 + S2.i16. + // Supports saturation (signed 16-bit integer domain). + void + Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI16 src2(gpuDynInst, extData.SRC2); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_PERM_B32 class methods --- + + Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_perm_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_PERM_B32 + + Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32() + { + } // ~Inst_VOP3__V_PERM_B32 + + // --- description from .arch file --- + // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]); + // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]); + // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]); + // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]); + // byte permute(byte in[8], byte sel) { + // if (sel>=13) then return 0xff; + // elsif(sel==12) then return 0x00; + // elsif(sel==11) then return in[7][7] * 0xff; + // elsif(sel==10) then return in[5][7] * 0xff; + // elsif(sel==9) then return in[3][7] * 0xff; + // elsif(sel==8) then return in[1][7] * 0xff; + // else return in[sel]; + // } + // Byte permute. 
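+ // For example, with the byte numbering above (S1 in bytes 0-3 of the + // selector and S0 in bytes 4-7, as built by the code below), S2 = + // 0x07060504 copies S0 into D, S2 = 0x03020100 copies S1 into D, and + // S2 = 0x0c0c0c0c produces 0.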
+ void + Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 selector = (VecElemU64)src0[lane]; + selector = (selector << 32) | (VecElemU64)src1[lane]; + vdst[lane] = 0; + + DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 " + "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane], + src1[lane], src2[lane], vdst[lane]); + DPRINTF(VEGA, "Selector: 0x%08x \n", selector); + + for (int i = 0; i < 4 ; ++i) { + VecElemU32 permuted_val = permute(selector, 0xFF + & ((VecElemU32)src2[lane] >> (8 * i))); + vdst[lane] |= (permuted_val << (8 * i)); + } + + DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FMA_F16 class methods --- + + Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fma_f16", false) + { + setFlag(ALU); + setFlag(F16); + setFlag(FMA); + } // Inst_VOP3__V_FMA_F16 + + Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16() + { + } // ~Inst_VOP3__V_FMA_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + S2.f16. + // Fused half precision multiply add. + void + Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods --- + + Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fixup_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_DIV_FIXUP_F16 + + Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16() + { + } // ~Inst_VOP3__V_DIV_FIXUP_F16 + + // --- description from .arch file --- + // sign_out = sign(S1.f16)^sign(S2.f16); + // if (S2.f16 == NAN) + // D.f16 = Quiet(S2.f16); + // else if (S1.f16 == NAN) + // D.f16 = Quiet(S1.f16); + // else if (S1.f16 == S2.f16 == 0) + // # 0/0 + // D.f16 = pele_nan(0xfe00); + // else if (abs(S1.f16) == abs(S2.f16) == +-INF) + // # inf/inf + // D.f16 = pele_nan(0xfe00); + // else if (S1.f16 ==0 || abs(S2.f16) == +-INF) + // # x/0, or inf/y + // D.f16 = sign_out ? -INF : INF; + // else if (abs(S1.f16) == +-INF || S2.f16 == 0) + // # x/inf, 0/y + // D.f16 = sign_out ? -0 : 0; + // else if ((exp(S2.f16) - exp(S1.f16)) < -150) + // D.f16 = sign_out ? -underflow : underflow; + // else if (exp(S1.f16) == 255) + // D.f16 = sign_out ? -overflow : overflow; + // else + // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16). + // Half precision division fixup. + // S0 = Quotient, S1 = Denominator, S3 = Numerator. + // Given a numerator, denominator, and quotient from a divide, this opcode + // will detect and apply special case numerics, touching up the quotient if + // necessary. This opcode also generates invalid, denorm and divide by + // zero exceptions caused by the division. 
+ void + Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods --- + + Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKACCUM_U8_F32 + + Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32() + { + } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32 + + // --- description from .arch file --- + // byte = S1.u[1:0]; bit = byte * 8; + // D.u[bit+7:bit] = flt32_to_uint8(S0.f); + // Pack converted value of S0.f into byte S1 of the destination. + // SQ translates to V_CVT_PK_U8_F32. + // Note: this opcode uses src_c to pass destination in as a source. + void + Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P1_F32 class methods --- + + Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p1_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_INTERP_P1_F32 + + Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32() + { + } // ~Inst_VOP3__V_INTERP_P1_F32 + + // --- description from .arch file --- + // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if + // D == S then data corruption will occur. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P2_F32 class methods --- + + Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p2_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_INTERP_P2_F32 + + Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32() + { + } // ~Inst_VOP3__V_INTERP_P2_F32 + + // --- description from .arch file --- + // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_MOV_F32 class methods --- + + Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_mov_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_INTERP_MOV_F32 + + Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32() + { + } // ~Inst_VOP3__V_INTERP_MOV_F32 + + // --- description from .arch file --- + // D.f = {P10,P20,P0}[S.u]; parameter load. 
+ void + Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods --- + + Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_INTERP_P1LL_F16 + + Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16() + { + } // ~Inst_VOP3__V_INTERP_P1LL_F16 + + // --- description from .arch file --- + // D.f32 = P10.f16 * S0.f32 + P0.f16. + // 'LL' stands for 'two LDS arguments'. + // attr_word selects the high or low half 16 bits of each LDS dword + // accessed. + // This opcode is available for 32-bank LDS only. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods --- + + Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_INTERP_P1LV_F16 + + Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16() + { + } // ~Inst_VOP3__V_INTERP_P1LV_F16 + + // --- description from .arch file --- + // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16. + // 'LV' stands for 'One LDS and one VGPR argument'. + // S2 holds two parameters, attr_word selects the high or low word of the + // VGPR for this calculation, as well as the high or low half of the LDS + // data. + // Meant for use with 16-bank LDS. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P2_F16 class methods --- + + Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p2_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_INTERP_P2_F16 + + Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16() + { + } // ~Inst_VOP3__V_INTERP_P2_F16 + + // --- description from .arch file --- + // D.f16 = P20.f16 * S0.f32 + S2.f32. + // Final computation. attr_word selects LDS high or low 16bits. Used for + // both 16- and 32-bank LDS. + // Result is always written to the 16 LSBs of the destination VGPR. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_ADD_F64 class methods --- + + Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_ADD_F64 + + Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64() + { + } // ~Inst_VOP3__V_ADD_F64 + + // --- description from .arch file --- + // D.d = S0.d + S1.d. 
+ void + Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane]) ) { + vdst[lane] = NAN; + } else if (std::isinf(src0[lane]) && + std::isinf(src1[lane])) { + if (std::signbit(src0[lane]) != + std::signbit(src1[lane])) { + vdst[lane] = NAN; + } else { + vdst[lane] = src0[lane]; + } + } else if (std::isinf(src0[lane])) { + vdst[lane] = src0[lane]; + } else if (std::isinf(src1[lane])) { + vdst[lane] = src1[lane]; + } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + if (std::signbit(src0[lane]) && + std::signbit(src1[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = 0.0; + } + } else { + vdst[lane] = src1[lane]; + } + } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) { + if (std::signbit(src0[lane]) && + std::signbit(src1[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = 0.0; + } + } else { + vdst[lane] = src0[lane]; + } + } else { + vdst[lane] = src0[lane] + src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_F64 class methods --- + + Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_MUL_F64 + + Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64() + { + } // ~Inst_VOP3__V_MUL_F64 + + // --- description from .arch file --- + // D.d = S0.d * S1.d. 
+ void + Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane])) { + vdst[lane] = NAN; + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + !std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if (std::isinf(src0[lane]) && + !std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else if (std::isinf(src0[lane]) && + std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else { + vdst[lane] = src0[lane] * src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_F64 class methods --- + + Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_MIN_F64 + + Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64() + { + } // ~Inst_VOP3__V_MIN_F64 + + // --- description from .arch file --- + // D.d = min(S0.d, S1.d). 
+ void + Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_F64 class methods --- + + Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_MAX_F64 + + Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64() + { + } // ~Inst_VOP3__V_MAX_F64 + + // --- description from .arch file --- + // D.d = max(S0.d, S1.d). + void + Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LDEXP_F64 class methods --- + + Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ldexp_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_LDEXP_F64 + + Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64() + { + } // ~Inst_VOP3__V_LDEXP_F64 + + // --- description from .arch file --- + // D.d = pow(S0.d, S1.i[31:0]). 
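+ // Despite the pow() shorthand above, the operation implemented below is + // S0.d * 2^S1.i, i.e. std::ldexp.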
+ void + Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { + vdst[lane] = src0[lane]; + } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + || std::fpclassify(src0[lane]) == FP_ZERO) { + if (std::signbit(src0[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = +0.0; + } + } else { + vdst[lane] = std::ldexp(src0[lane], src1[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_LO_U32 class methods --- + + Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_lo_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_LO_U32 + + Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32() + { + } // ~Inst_VOP3__V_MUL_LO_U32 + + // --- description from .arch file --- + // D.u = S0.u * S1.u. + void + Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 s0 = (VecElemI64)src0[lane]; + VecElemI64 s1 = (VecElemI64)src1[lane]; + vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_U32 class methods --- + + Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_U32 + + Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32() + { + } // ~Inst_VOP3__V_MUL_HI_U32 + + // --- description from .arch file --- + // D.u = (S0.u * S1.u) >> 32. 
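+ // Paired with v_mul_lo_u32 on the same operands this yields the full + // 64-bit product of two 32-bit values.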
+ void + Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 s0 = (VecElemI64)src0[lane]; + VecElemI64 s1 = (VecElemI64)src1[lane]; + vdst[lane] + = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_I32 class methods --- + + Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_I32 + + Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32() + { + } // ~Inst_VOP3__V_MUL_HI_I32 + + // --- description from .arch file --- + // D.i = (S0.i * S1.i) >> 32. + void + Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 s0 = (VecElemI64)src0[lane]; + VecElemI64 s1 = (VecElemI64)src1[lane]; + vdst[lane] + = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LDEXP_F32 class methods --- + + Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ldexp_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_LDEXP_F32 + + Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32() + { + } // ~Inst_VOP3__V_LDEXP_F32 + + // --- description from .arch file --- + // D.f = pow(S0.f, S1.i) + void + Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ldexp(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_READLANE_B32 class methods --- + + Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_readlane_b32", true) + { + setFlag(ALU); + setFlag(IgnoreExec); + } // Inst_VOP3__V_READLANE_B32 + + Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32() + { + } // ~Inst_VOP3__V_READLANE_B32 + + // --- description from .arch file --- 
+ // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR# + // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + sdst = src0[src1.rawData() & 0x3f]; + + sdst.write(); + } // execute + // --- Inst_VOP3__V_WRITELANE_B32 class methods --- + + Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_writelane_b32", false) + { + setFlag(ALU); + setFlag(IgnoreExec); + } // Inst_VOP3__V_WRITELANE_B32 + + Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32() + { + } // ~Inst_VOP3__V_WRITELANE_B32 + + // --- description from .arch file --- + // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data + // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores + // exec mask. + // Input and output modifiers not supported; this is an untyped operation. + // SQ translates to V_MOV_B32. + void + Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0); + ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.read(); + src1.read(); + vdst.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + vdst[src1.rawData() & 0x3f] = src0.rawData(); + + vdst.write(); + } // execute + // --- Inst_VOP3__V_BCNT_U32_B32 class methods --- + + Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BCNT_U32_B32 + + Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32() + { + } // ~Inst_VOP3__V_BCNT_U32_B32 + + // --- description from .arch file --- + // D.u = CountOneBits(S0.u) + S1.u. Bit count. 
+ void + Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = popCount(src0[lane]) + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods --- + + Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MBCNT_LO_U32_B32 + + Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32() + { + } // ~Inst_VOP3__V_MBCNT_LO_U32_B32 + + // --- description from .arch file --- + // ThreadMask = (1 << ThreadPosition) - 1; + // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u. + // Masked bit count, ThreadPosition is the position of this thread in the + // --- wavefront (in 0..63). + void + Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + uint64_t threadMask = 0; + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + threadMask = ((1LL << lane) - 1LL); + vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- + + Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MBCNT_HI_U32_B32 + + Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32() + { + } // ~Inst_VOP3__V_MBCNT_HI_U32_B32 + + // --- description from .arch file --- + // ThreadMask = (1 << ThreadPosition) - 1; + // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u. + // Masked bit count, ThreadPosition is the position of this thread in the + // --- wavefront (in 0..63). 
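+ // Note: mbcnt_lo and mbcnt_hi are typically issued back to back, e.g. + //   v_mbcnt_lo_u32_b32 v0, -1, 0 + //   v_mbcnt_hi_u32_b32 v0, -1, v0 + // which leaves each lane's position within the 64-wide wavefront in v0.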
+ void + Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + uint64_t threadMask = 0; + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + threadMask = ((1LL << lane) - 1LL); + vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHLREV_B64 class methods --- + + Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshlrev_b64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHLREV_B64 + + Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64() + { + } // ~Inst_VOP3__V_LSHLREV_B64 + + // --- description from .arch file --- + // D.u64 = S1.u64 << S0.u[5:0]. + // SQ translates this to an internal SP opcode. + void + Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 5, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHRREV_B64 class methods --- + + Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshrrev_b64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHRREV_B64 + + Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64() + { + } // ~Inst_VOP3__V_LSHRREV_B64 + + // --- description from .arch file --- + // D.u64 = S1.u64 >> S0.u[5:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ASHRREV_I64 class methods --- + + Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ashrrev_i64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ASHRREV_I64 + + Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64() + { + } // ~Inst_VOP3__V_ASHRREV_I64 + + // --- description from .arch file --- + // D.u64 = signext(S1.u64) >> S0.u[5:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. + void + Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = src1[lane] >> bits(src0[lane], 5, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods --- + + Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trig_preop_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_TRIG_PREOP_F64 + + Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64() + { + } // ~Inst_VOP3__V_TRIG_PREOP_F64 + + // --- description from .arch file --- + // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation + // returns an aligned, double precision segment of 2/PI needed to do range + // reduction on S0.d (double-precision value). Multiple segments can be + // specified through S1.u[4:0]. Rounding is always round-to-zero. Large + // inputs (exp > 1968) are scaled to avoid loss of precision through + // denormalization. 
+ void + Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_BFM_B32 class methods --- + + Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfm_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFM_B32 + + Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32() + { + } // ~Inst_VOP3__V_BFM_B32 + + // --- description from .arch file --- + // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0]; bitfield mask. + void + Inst_VOP3__V_BFM_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1) + << bits(src1[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods --- + + Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKNORM_I16_F32 + + Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32() + { + } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32 + + // --- description from .arch file --- + // D = {(snorm)S1.f, (snorm)S0.f}. + void + Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods --- + + Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKNORM_U16_F32 + + Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32() + { + } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32 + + // --- description from .arch file --- + // D = {(unorm)S1.f, (unorm)S0.f}. + void + Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods --- + + Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKRTZ_F16_F32 + + Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32() + { + } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32 + + // --- description from .arch file --- + // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero + // --- regardless of current round mode setting in hardware. + // This opcode is intended for use with 16-bit compressed exports. + // See V_CVT_F16_F32 for a version that respects the current rounding mode.
+ void + Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods --- + + Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_CVT_PK_U16_U32 + + Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32() + { + } // ~Inst_VOP3__V_CVT_PK_U16_U32 + + // --- description from .arch file --- + // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}. + void + Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods --- + + Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_CVT_PK_I16_I32 + + Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32() + { + } // ~Inst_VOP3__V_CVT_PK_I16_I32 + + // --- description from .arch file --- + // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}. + void + Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3_cmp.cc b/src/arch/amdgpu/vega/insts/vop3_cmp.cc new file mode 100644 index 0000000000..4bbec930e6 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3_cmp.cc @@ -0,0 +1,8145 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3__V_CMP_CLASS_F32 class methods --- + + Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_CLASS_F32 + + Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32() + { + } // ~Inst_VOP3__V_CMP_CLASS_F32 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F32 class methods --- + + Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F32 + + Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F32 + + // --- description from .arch file --- + // EXEC, VCC = IEEE 
numeric class function specified in S1.u, performed on + // S0.f + // The function reports true if the floating point value is *any* of the + // numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_CLASS_F64 class methods --- + + Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_CLASS_F64 + + Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64() + { + } // ~Inst_VOP3__V_CMP_CLASS_F64 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.d + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. 
+ // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F64 class methods --- + + Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F64 + + Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F64 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.d + // The function reports true if the floating point value is *any* of the + // numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
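+    // Note (added commentary, not from the .arch file): the ten class-mask
+    // tests used by the V_CMP*_CLASS_* implementations in this file could
+    // equivalently be folded into a single classification helper. A minimal
+    // sketch, assuming a hypothetical classMatches() helper:
+    //
+    //   template<typename T>
+    //   bool classMatches(T v, uint32_t mask)
+    //   {
+    //       bool neg = std::signbit(v);
+    //       switch (std::fpclassify(v)) {
+    //         case FP_NAN:       return mask & 0x003;             // bits 0-1
+    //         case FP_INFINITE:  return mask & (neg ? 0x004 : 0x200);
+    //         case FP_NORMAL:    return mask & (neg ? 0x008 : 0x100);
+    //         case FP_SUBNORMAL: return mask & (neg ? 0x010 : 0x080);
+    //         default:           return mask & (neg ? 0x020 : 0x040); // zero
+    //       }
+    //   }
+    //
+    // The code below keeps the explicit per-bit checks so that each test
+    // mirrors the .arch description above line for line.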
+ void + Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_CLASS_F16 class methods --- + + Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_CLASS_F16 + + Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16() + { + } // ~Inst_VOP3__V_CMP_CLASS_F16 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F16 class methods --- + + Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F16 + + Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F16 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // --- S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_F_F16 class methods --- + + Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_F_F16 + + Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16() + { + } // ~Inst_VOP3__V_CMP_F_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_LT_F16 class methods --- + + Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_LT_F16 + + Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16() + { + } // ~Inst_VOP3__V_CMP_LT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_F16 class methods --- + + Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_EQ_F16 + + Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16() + { + } // ~Inst_VOP3__V_CMP_EQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_LE_F16 class methods --- + + Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_LE_F16 + + Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16() + { + } // ~Inst_VOP3__V_CMP_LE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_GT_F16 class methods --- + + Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_GT_F16 + + Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16() + { + } // ~Inst_VOP3__V_CMP_GT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_LG_F16 class methods --- + + Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lg_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_LG_F16 + + Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16() + { + } // ~Inst_VOP3__V_CMP_LG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_GE_F16 class methods --- + + Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_GE_F16 + + Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16() + { + } // ~Inst_VOP3__V_CMP_GE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_O_F16 class methods --- + + Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_o_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_O_F16 + + Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16() + { + } // ~Inst_VOP3__V_CMP_O_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_U_F16 class methods --- + + Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_u_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_U_F16 + + Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() + { + } // ~Inst_VOP3__V_CMP_U_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NGE_F16 class methods --- + + Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nge_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NGE_F16 + + Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() + { + } // ~Inst_VOP3__V_CMP_NGE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLG_F16 class methods --- + + Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlg_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLG_F16 + + Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() + { + } // ~Inst_VOP3__V_CMP_NLG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NGT_F16 class methods --- + + Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ngt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NGT_F16 + + Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() + { + } // ~Inst_VOP3__V_CMP_NGT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLE_F16 class methods --- + + Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nle_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLE_F16 + + Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() + { + } // ~Inst_VOP3__V_CMP_NLE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NEQ_F16 class methods --- + + Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_neq_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NEQ_F16 + + Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() + { + } // ~Inst_VOP3__V_CMP_NEQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLT_F16 class methods --- + + Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLT_F16 + + Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() + { + } // ~Inst_VOP3__V_CMP_NLT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_TRU_F16 class methods --- + + Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_tru_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_TRU_F16 + + Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() + { + } // ~Inst_VOP3__V_CMP_TRU_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_F16 class methods --- + + Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_f16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_F16 + + Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() + { + } // ~Inst_VOP3__V_CMPX_F_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_F16 class methods --- + + Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_F16 + + Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() + { + } // ~Inst_VOP3__V_CMPX_LT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_F16 class methods --- + + Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_F16 + + Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() + { + } // ~Inst_VOP3__V_CMPX_EQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_F16 class methods --- + + Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_F16 + + Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() + { + } // ~Inst_VOP3__V_CMPX_LE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_F16 class methods --- + + Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_F16 + + Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() + { + } // ~Inst_VOP3__V_CMPX_GT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_LG_F16 class methods --- + + Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lg_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LG_F16 + + Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() + { + } // ~Inst_VOP3__V_CMPX_LG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_F16 class methods --- + + Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_F16 + + Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() + { + } // ~Inst_VOP3__V_CMPX_GE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_O_F16 class methods --- + + Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_o_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_O_F16 + + Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() + { + } // ~Inst_VOP3__V_CMPX_O_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_U_F16 class methods --- + + Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_u_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_U_F16 + + Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16() + { + } // ~Inst_VOP3__V_CMPX_U_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NGE_F16 class methods --- + + Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nge_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGE_F16 + + Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16() + { + } // ~Inst_VOP3__V_CMPX_NGE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NLG_F16 class methods --- + + Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlg_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLG_F16 + + Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16() + { + } // ~Inst_VOP3__V_CMPX_NLG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NGT_F16 class methods --- + + Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ngt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGT_F16 + + Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16() + { + } // ~Inst_VOP3__V_CMPX_NGT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NLE_F16 class methods --- + + Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nle_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLE_F16 + + Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16() + { + } // ~Inst_VOP3__V_CMPX_NLE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NEQ_F16 class methods --- + + Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_neq_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NEQ_F16 + + Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16() + { + } // ~Inst_VOP3__V_CMPX_NEQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F16 class methods --- + + Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F16 + + Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16() + { + } // ~Inst_VOP3__V_CMPX_NLT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F16 class methods --- + + Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F16 + + Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() + { + } // ~Inst_VOP3__V_CMPX_TRU_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
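+    // Note (added commentary, not from the .arch file): every implemented
+    // V_CMPX_* op in this file ends with the same sequence -- the per-lane
+    // result mask accumulated in sdst is copied into the wavefront's EXEC
+    // mask before the destination is written back:
+    //
+    //   wf->execMask() = sdst.rawData();   // EXEC <- comparison result
+    //   sdst.write();                      // D (VCC/SGPR pair) <- result
+    //
+    // Apart from the WritesEXEC flag set in the constructors, this is the
+    // only difference from the corresponding V_CMP_* ops.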
+ void + Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_F32 class methods --- + + Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_F_F32 + + Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() + { + } // ~Inst_VOP3__V_CMP_F_F32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_F32 class methods --- + + Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_LT_F32 + + Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() + { + } // ~Inst_VOP3__V_CMP_LT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_F32 class methods --- + + Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_EQ_F32 + + Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() + { + } // ~Inst_VOP3__V_CMP_EQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_F32 class methods --- + + Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_LE_F32 + + Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() + { + } // ~Inst_VOP3__V_CMP_LE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_F32 class methods --- + + Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_GT_F32 + + Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() + { + } // ~Inst_VOP3__V_CMP_GT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LG_F32 class methods --- + + Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lg_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_LG_F32 + + Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() + { + } // ~Inst_VOP3__V_CMP_LG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_F32 class methods --- + + Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_GE_F32 + + Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() + { + } // ~Inst_VOP3__V_CMP_GE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_O_F32 class methods --- + + Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_o_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_O_F32 + + Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() + { + } // ~Inst_VOP3__V_CMP_O_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_U_F32 class methods --- + + Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_u_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_U_F32 + + Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() + { + } // ~Inst_VOP3__V_CMP_U_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGE_F32 class methods --- + + Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nge_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NGE_F32 + + Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() + { + } // ~Inst_VOP3__V_CMP_NGE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLG_F32 class methods --- + + Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlg_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NLG_F32 + + Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() + { + } // ~Inst_VOP3__V_CMP_NLG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
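+    // Note (added commentary, not from the .arch file): NaN compares
+    // unordered, so the negated form !(S0 <> S1) implemented below is true
+    // both when the operands are equal and when either operand is NaN.
+    // A per-lane example:
+    //
+    //   float a = std::nanf(""), b = 1.0f;
+    //   bool nlg = !(a < b || a > b);   // true: NaN makes both tests false
+    //
+    // The same reasoning applies to the other negated F32 compares in this
+    // file (V_CMP_NGE/NGT/NLE/NLT and their CMPX variants).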
+ void + Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGT_F32 class methods --- + + Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ngt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NGT_F32 + + Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() + { + } // ~Inst_VOP3__V_CMP_NGT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLE_F32 class methods --- + + Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nle_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NLE_F32 + + Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() + { + } // ~Inst_VOP3__V_CMP_NLE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NEQ_F32 class methods --- + + Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_neq_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NEQ_F32 + + Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() + { + } // ~Inst_VOP3__V_CMP_NEQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLT_F32 class methods --- + + Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NLT_F32 + + Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() + { + } // ~Inst_VOP3__V_CMP_NLT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_TRU_F32 class methods --- + + Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_tru_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_TRU_F32 + + Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32() + { + } // ~Inst_VOP3__V_CMP_TRU_F32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_F32 class methods --- + + Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_F32 + + Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32() + { + } // ~Inst_VOP3__V_CMPX_F_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_F32 class methods --- + + Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_F32 + + Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() + { + } // ~Inst_VOP3__V_CMPX_LT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_F32 class methods --- + + Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_F32 + + Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() + { + } // ~Inst_VOP3__V_CMPX_EQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_F32 class methods --- + + Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_F32 + + Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() + { + } // ~Inst_VOP3__V_CMPX_LE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_F32 class methods --- + + Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_F32 + + Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() + { + } // ~Inst_VOP3__V_CMPX_GT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LG_F32 class methods --- + + Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lg_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LG_F32 + + Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() + { + } // ~Inst_VOP3__V_CMPX_LG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_F32 class methods --- + + Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_F32 + + Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() + { + } // ~Inst_VOP3__V_CMPX_GE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_O_F32 class methods --- + + Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_o_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_O_F32 + + Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() + { + } // ~Inst_VOP3__V_CMPX_O_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_U_F32 class methods --- + + Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_u_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_U_F32 + + Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() + { + } // ~Inst_VOP3__V_CMPX_U_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGE_F32 class methods --- + + Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nge_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGE_F32 + + Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() + { + } // ~Inst_VOP3__V_CMPX_NGE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLG_F32 class methods --- + + Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlg_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLG_F32 + + Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() + { + } // ~Inst_VOP3__V_CMPX_NLG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGT_F32 class methods --- + + Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ngt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGT_F32 + + Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() + { + } // ~Inst_VOP3__V_CMPX_NGT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLE_F32 class methods --- + + Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nle_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLE_F32 + + Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() + { + } // ~Inst_VOP3__V_CMPX_NLE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NEQ_F32 class methods --- + + Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_neq_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NEQ_F32 + + Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() + { + } // ~Inst_VOP3__V_CMPX_NEQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F32 class methods --- + + Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F32 + + Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() + { + } // ~Inst_VOP3__V_CMPX_NLT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F32 class methods --- + + Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F32 + + Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() + { + } // ~Inst_VOP3__V_CMPX_TRU_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_F64 class methods --- + + Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_F_F64 + + Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() + { + } // ~Inst_VOP3__V_CMP_F_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_F64 class methods --- + + Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_LT_F64 + + Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() + { + } // ~Inst_VOP3__V_CMP_LT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_F64 class methods --- + + Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_EQ_F64 + + Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() + { + } // ~Inst_VOP3__V_CMP_EQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
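+ // The VOP3 F64 compares below apply the encoded input modifiers before
+ // comparing: ABS/NEG bit 0 affects SRC0 and bit 1 affects SRC1, while
+ // bit 2 is asserted clear, presumably because a compare has no third
+ // source operand.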
+ void + Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_F64 class methods --- + + Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_LE_F64 + + Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() + { + } // ~Inst_VOP3__V_CMP_LE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_F64 class methods --- + + Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_GT_F64 + + Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() + { + } // ~Inst_VOP3__V_CMP_GT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LG_F64 class methods --- + + Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lg_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_LG_F64 + + Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() + { + } // ~Inst_VOP3__V_CMP_LG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_F64 class methods --- + + Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_GE_F64 + + Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() + { + } // ~Inst_VOP3__V_CMP_GE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_O_F64 class methods --- + + Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_o_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_O_F64 + + Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() + { + } // ~Inst_VOP3__V_CMP_O_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
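+ // Ordered compare: the lane bit is set only when neither source is NaN,
+ // i.e. when the two values are actually comparable under IEEE 754.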
+ void + Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_U_F64 class methods --- + + Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_u_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_U_F64 + + Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() + { + } // ~Inst_VOP3__V_CMP_U_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGE_F64 class methods --- + + Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nge_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NGE_F64 + + Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() + { + } // ~Inst_VOP3__V_CMP_NGE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLG_F64 class methods --- + + Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlg_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NLG_F64 + + Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() + { + } // ~Inst_VOP3__V_CMP_NLG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGT_F64 class methods --- + + Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ngt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NGT_F64 + + Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() + { + } // ~Inst_VOP3__V_CMP_NGT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLE_F64 class methods --- + + Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nle_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NLE_F64 + + Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() + { + } // ~Inst_VOP3__V_CMP_NLE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
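+ // The negated ("n*") compares are written as !(S0 op S1) rather than the
+ // complementary operator, so they differ from the plain compares only
+ // when an operand is NaN: for example !(S0 <= S1) evaluates true when S0
+ // is NaN, whereas S0 > S1 would be false under IEEE 754 semantics.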
+ void + Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NEQ_F64 class methods --- + + Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_neq_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NEQ_F64 + + Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() + { + } // ~Inst_VOP3__V_CMP_NEQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLT_F64 class methods --- + + Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NLT_F64 + + Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() + { + } // ~Inst_VOP3__V_CMP_NLT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_TRU_F64 class methods --- + + Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_tru_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_TRU_F64 + + Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() + { + } // ~Inst_VOP3__V_CMP_TRU_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_F64 class methods --- + + Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_F64 + + Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() + { + } // ~Inst_VOP3__V_CMPX_F_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_F64 class methods --- + + Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_F64 + + Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() + { + } // ~Inst_VOP3__V_CMPX_LT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_F64 class methods --- + + Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_F64 + + Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() + { + } // ~Inst_VOP3__V_CMPX_EQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
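+ // The F64 CMPX variants follow the same pattern as the F32 ones above:
+ // after the per-lane compare, wf->execMask() is overwritten with the raw
+ // result mask in addition to the SDST write, so the comparison result
+ // becomes the execution mask seen by subsequent instructions.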
+ void + Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_F64 class methods --- + + Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_F64 + + Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() + { + } // ~Inst_VOP3__V_CMPX_LE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_F64 class methods --- + + Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_F64 + + Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() + { + } // ~Inst_VOP3__V_CMPX_GT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LG_F64 class methods --- + + Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lg_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LG_F64 + + Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() + { + } // ~Inst_VOP3__V_CMPX_LG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_F64 class methods --- + + Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_F64 + + Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() + { + } // ~Inst_VOP3__V_CMPX_GE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_O_F64 class methods --- + + Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_o_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_O_F64 + + Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() + { + } // ~Inst_VOP3__V_CMPX_O_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_U_F64 class methods --- + + Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_u_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_U_F64 + + Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() + { + } // ~Inst_VOP3__V_CMPX_U_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGE_F64 class methods --- + + Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nge_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGE_F64 + + Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() + { + } // ~Inst_VOP3__V_CMPX_NGE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLG_F64 class methods --- + + Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlg_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLG_F64 + + Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() + { + } // ~Inst_VOP3__V_CMPX_NLG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGT_F64 class methods --- + + Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ngt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGT_F64 + + Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() + { + } // ~Inst_VOP3__V_CMPX_NGT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLE_F64 class methods --- + + Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nle_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLE_F64 + + Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() + { + } // ~Inst_VOP3__V_CMPX_NLE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NEQ_F64 class methods --- + + Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_neq_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NEQ_F64 + + Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() + { + } // ~Inst_VOP3__V_CMPX_NEQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F64 class methods --- + + Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F64 + + Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() + { + } // ~Inst_VOP3__V_CMPX_NLT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F64 class methods --- + + Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F64 + + Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() + { + } // ~Inst_VOP3__V_CMPX_TRU_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_I16 class methods --- + + Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_I16 + + Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() + { + } // ~Inst_VOP3__V_CMP_F_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_I16 class methods --- + + Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_I16 + + Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() + { + } // ~Inst_VOP3__V_CMP_LT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
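+ // For the integer (i16/u16) compares below, input modifiers are defined
+ // only for floating-point sources, so instead of applying ABS/NEG the
+ // code asserts that every modifier bit is clear.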
+ void + Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_I16 class methods --- + + Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_I16 + + Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() + { + } // ~Inst_VOP3__V_CMP_EQ_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_I16 class methods --- + + Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_I16 + + Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() + { + } // ~Inst_VOP3__V_CMP_LE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_I16 class methods --- + + Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_I16 + + Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() + { + } // ~Inst_VOP3__V_CMP_GT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_I16 class methods --- + + Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_I16 + + Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() + { + } // ~Inst_VOP3__V_CMP_NE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_I16 class methods --- + + Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_I16 + + Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() + { + } // ~Inst_VOP3__V_CMP_GE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_I16 class methods --- + + Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_I16 + + Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() + { + } // ~Inst_VOP3__V_CMP_T_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
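+ // The "t" (always true) and "f" (always false) compares ignore the
+ // sources entirely; each active lane's bit is simply written with the
+ // constant 1 or 0 before SDST is stored.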
+ void + Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_U16 class methods --- + + Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_U16 + + Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() + { + } // ~Inst_VOP3__V_CMP_F_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_U16 class methods --- + + Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_U16 + + Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() + { + } // ~Inst_VOP3__V_CMP_LT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_U16 class methods --- + + Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_U16 + + Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() + { + } // ~Inst_VOP3__V_CMP_EQ_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_U16 class methods --- + + Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_U16 + + Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() + { + } // ~Inst_VOP3__V_CMP_LE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_U16 class methods --- + + Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_U16 + + Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() + { + } // ~Inst_VOP3__V_CMP_GT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_U16 class methods --- + + Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_U16 + + Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() + { + } // ~Inst_VOP3__V_CMP_NE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_U16 class methods --- + + Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_U16 + + Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() + { + } // ~Inst_VOP3__V_CMP_GE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_U16 class methods --- + + Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_U16 + + Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() + { + } // ~Inst_VOP3__V_CMP_T_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_I16 class methods --- + + Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_I16 + + Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() + { + } // ~Inst_VOP3__V_CMPX_F_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_I16 class methods --- + + Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_I16 + + Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() + { + } // ~Inst_VOP3__V_CMPX_LT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
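+    // The v_cmpx_* compares (here and below) all follow the same pattern:
+    // the compare is evaluated only for lanes active under the current EXEC
+    // mask, the per-lane results are collected in the 64-bit scalar
+    // destination (an SGPR pair selected by VDST in this VOP3 encoding, VCC
+    // in the VOPC encoding), and EXEC is then overwritten with that result
+    // before the destination register is written back. Roughly:
+    //
+    //     for (lane = 0; lane < NumVecElemPerVecReg; ++lane)
+    //         if (EXEC[lane])
+    //             result[lane] = cmp(S0[lane], S1[lane]);
+    //     EXEC = result;
+    //     SDST = result;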
+ void + Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_I16 class methods --- + + Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_I16 + + Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() + { + } // ~Inst_VOP3__V_CMPX_EQ_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_I16 class methods --- + + Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_I16 + + Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() + { + } // ~Inst_VOP3__V_CMPX_LE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_I16 class methods --- + + Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_I16 + + Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() + { + } // ~Inst_VOP3__V_CMPX_GT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_I16 class methods --- + + Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_I16 + + Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16() + { + } // ~Inst_VOP3__V_CMPX_NE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_I16 class methods --- + + Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_I16 + + Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16() + { + } // ~Inst_VOP3__V_CMPX_GE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_I16 class methods --- + + Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_I16 + + Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16() + { + } // ~Inst_VOP3__V_CMPX_T_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_U16 class methods --- + + Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_u16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_U16 + + Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16() + { + } // ~Inst_VOP3__V_CMPX_F_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_U16 class methods --- + + Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_u16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_U16 + + Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16() + { + } // ~Inst_VOP3__V_CMPX_LT_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_EQ_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_eq_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_EQ_U16
+
+    Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_EQ_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_le_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LE_U16
+
+    Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GT_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_gt_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GT_U16
+
+    Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GT_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ne_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NE_U16
+
+    Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_NE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ge_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GE_U16
+
+    Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ?
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_U16 class methods --- + + Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_u16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_U16 + + Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16() + { + } // ~Inst_VOP3__V_CMPX_T_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_I32 class methods --- + + Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_I32 + + Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32() + { + } // ~Inst_VOP3__V_CMP_F_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_I32 class methods --- + + Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_I32 + + Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32() + { + } // ~Inst_VOP3__V_CMP_LT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_I32 class methods --- + + Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_I32 + + Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32() + { + } // ~Inst_VOP3__V_CMP_EQ_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
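+    // As in the other integer compares, only lanes active in EXEC have their
+    // result bit explicitly set or cleared by the loop below; bits for
+    // inactive lanes are not touched before the destination is written back.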
+ void + Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_I32 class methods --- + + Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_I32 + + Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32() + { + } // ~Inst_VOP3__V_CMP_LE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_I32 class methods --- + + Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_I32 + + Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32() + { + } // ~Inst_VOP3__V_CMP_GT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_I32 class methods --- + + Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_I32 + + Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32() + { + } // ~Inst_VOP3__V_CMP_NE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_I32 class methods --- + + Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_I32 + + Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32() + { + } // ~Inst_VOP3__V_CMP_GE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_I32 class methods --- + + Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_i32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_I32 + + Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32() + { + } // ~Inst_VOP3__V_CMP_T_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_U32 class methods --- + + Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_U32 + + Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32() + { + } // ~Inst_VOP3__V_CMP_F_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
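+    // v_cmp_f_* ("false") and v_cmp_t_* ("true") ignore their sources: every
+    // active lane's result bit is simply cleared or set, so no source
+    // operands are read and no input-modifier checks are needed here.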
+ void + Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_U32 class methods --- + + Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_U32 + + Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32() + { + } // ~Inst_VOP3__V_CMP_LT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_U32 class methods --- + + Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_U32 + + Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32() + { + } // ~Inst_VOP3__V_CMP_EQ_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_U32 class methods --- + + Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_U32 + + Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32() + { + } // ~Inst_VOP3__V_CMP_LE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_U32 class methods --- + + Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_U32 + + Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32() + { + } // ~Inst_VOP3__V_CMP_GT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_U32 class methods --- + + Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_U32 + + Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32() + { + } // ~Inst_VOP3__V_CMP_NE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_U32 class methods --- + + Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_U32 + + Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32() + { + } // ~Inst_VOP3__V_CMP_GE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_U32 class methods --- + + Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_U32 + + Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32() + { + } // ~Inst_VOP3__V_CMP_T_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_I32 class methods --- + + Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_I32 + + Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32() + { + } // ~Inst_VOP3__V_CMPX_F_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_I32 class methods --- + + Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_I32 + + Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32() + { + } // ~Inst_VOP3__V_CMPX_LT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_I32 class methods --- + + Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_I32 + + Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32() + { + } // ~Inst_VOP3__V_CMPX_EQ_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_I32 class methods --- + + Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_I32 + + Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32() + { + } // ~Inst_VOP3__V_CMPX_LE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_I32 class methods --- + + Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_I32 + + Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32() + { + } // ~Inst_VOP3__V_CMPX_GT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_I32 class methods --- + + Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_I32 + + Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32() + { + } // ~Inst_VOP3__V_CMPX_NE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_I32 class methods --- + + Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_I32 + + Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32() + { + } // ~Inst_VOP3__V_CMPX_GE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_I32 class methods --- + + Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_I32 + + Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32() + { + } // ~Inst_VOP3__V_CMPX_T_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_U32 class methods --- + + Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_U32 + + Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() + { + } // ~Inst_VOP3__V_CMPX_F_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_U32 class methods --- + + Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_U32 + + Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() + { + } // ~Inst_VOP3__V_CMPX_LT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_U32 class methods --- + + Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_U32 + + Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() + { + } // ~Inst_VOP3__V_CMPX_EQ_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_U32 class methods --- + + Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_U32 + + Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() + { + } // ~Inst_VOP3__V_CMPX_LE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_U32 class methods --- + + Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_U32 + + Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() + { + } // ~Inst_VOP3__V_CMPX_GT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_U32 class methods --- + + Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_U32 + + Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() + { + } // ~Inst_VOP3__V_CMPX_NE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_U32 class methods --- + + Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_U32 + + Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() + { + } // ~Inst_VOP3__V_CMPX_GE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_U32 class methods --- + + Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_U32 + + Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() + { + } // ~Inst_VOP3__V_CMPX_T_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_I64 class methods --- + + Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_I64 + + Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() + { + } // ~Inst_VOP3__V_CMP_F_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_I64 class methods --- + + Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_I64 + + Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() + { + } // ~Inst_VOP3__V_CMP_LT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_I64 class methods --- + + Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_I64 + + Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() + { + } // ~Inst_VOP3__V_CMP_EQ_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_I64 class methods --- + + Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_I64 + + Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() + { + } // ~Inst_VOP3__V_CMP_LE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_I64 class methods --- + + Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_I64 + + Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() + { + } // ~Inst_VOP3__V_CMP_GT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_I64 class methods --- + + Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_I64 + + Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() + { + } // ~Inst_VOP3__V_CMP_NE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_I64 class methods --- + + Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_I64 + + Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() + { + } // ~Inst_VOP3__V_CMP_GE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_I64 class methods --- + + Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_I64 + + Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() + { + } // ~Inst_VOP3__V_CMP_T_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_U64 class methods --- + + Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_U64 + + Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() + { + } // ~Inst_VOP3__V_CMP_F_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_U64 class methods --- + + Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_U64 + + Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() + { + } // ~Inst_VOP3__V_CMP_LT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
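+ // Unlike the CMPX forms, the plain CMP forms below only build the per-lane + // result mask in the SDST scalar destination (VCC when the VOPC encoding is + // used) and leave EXEC untouched. For example, in this less-than compare a + // lane where S0 = 1 and S1 = 2 sets its bit to 1, while a lane where + // S0 = 2 and S1 = 2 sets its bit to 0.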
+ void + Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_U64 class methods --- + + Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_U64 + + Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() + { + } // ~Inst_VOP3__V_CMP_EQ_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_U64 class methods --- + + Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_U64 + + Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() + { + } // ~Inst_VOP3__V_CMP_LE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_U64 class methods --- + + Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_U64 + + Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() + { + } // ~Inst_VOP3__V_CMP_GT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_U64 class methods --- + + Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_U64 + + Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() + { + } // ~Inst_VOP3__V_CMP_NE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_U64 class methods --- + + Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_U64 + + Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() + { + } // ~Inst_VOP3__V_CMP_GE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_U64 class methods --- + + Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_U64 + + Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() + { + } // ~Inst_VOP3__V_CMP_T_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
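+ // Note that even this always-true compare is exec-masked: the loop below + // only sets the result bit to 1 for lanes that are active in EXEC, and the + // bits belonging to inactive lanes are not written at all.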
+ void + Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_I64 class methods --- + + Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_I64 + + Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() + { + } // ~Inst_VOP3__V_CMPX_F_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_I64 class methods --- + + Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_I64 + + Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() + { + } // ~Inst_VOP3__V_CMPX_LT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_I64 class methods --- + + Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_I64 + + Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() + { + } // ~Inst_VOP3__V_CMPX_EQ_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_I64 class methods --- + + Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_I64 + + Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() + { + } // ~Inst_VOP3__V_CMPX_LE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_I64 class methods --- + + Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_I64 + + Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() + { + } // ~Inst_VOP3__V_CMPX_GT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_I64 class methods --- + + Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_I64 + + Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() + { + } // ~Inst_VOP3__V_CMPX_NE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_I64 class methods --- + + Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_I64 + + Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() + { + } // ~Inst_VOP3__V_CMPX_GE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_I64 class methods --- + + Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_I64 + + Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() + { + } // ~Inst_VOP3__V_CMPX_T_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_U64 class methods --- + + Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_U64 + + Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() + { + } // ~Inst_VOP3__V_CMPX_F_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_U64 class methods --- + + Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_U64 + + Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() + { + } // ~Inst_VOP3__V_CMPX_LT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_U64 class methods --- + + Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_U64 + + Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() + { + } // ~Inst_VOP3__V_CMPX_EQ_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_U64 class methods --- + + Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_U64 + + Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() + { + } // ~Inst_VOP3__V_CMPX_LE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_U64 class methods --- + + Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_U64 + + Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() + { + } // ~Inst_VOP3__V_CMPX_GT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_U64 class methods --- + + Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_U64 + + Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() + { + } // ~Inst_VOP3__V_CMPX_NE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_U64 class methods --- + + Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_U64 + + Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() + { + } // ~Inst_VOP3__V_CMPX_GE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_U64 class methods --- + + Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_U64 + + Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() + { + } // ~Inst_VOP3__V_CMPX_T_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc index eddb1e7ad5..85f0af2a51 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.cc +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -31,6 +31,7 @@ #include "arch/amdgpu/vega/insts/vop3p.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" #include "arch/arm/insts/fplib.hh" namespace gem5 @@ -631,5 +632,236 @@ void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) vdst.write(); } +// --- Inst_VOP3P__V_PK_FMA_F32 class methods --- + +Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_fma_f32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_FMA_F32 + +Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32() +{ +} // ~Inst_VOP3P__V_PK_FMA_F32 + +// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] = +// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] . +void +Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. 
U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32) + : bits(src2[lane], 31, 0); + + float dword1 = std::fma(*reinterpret_cast<float*>(&s0l), + *reinterpret_cast<float*>(&s1l), + *reinterpret_cast<float*>(&s2l)); + + uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32) + : bits(src2[lane], 31, 0); + + float dword2 = std::fma(*reinterpret_cast<float*>(&s0h), + *reinterpret_cast<float*>(&s1h), + *reinterpret_cast<float*>(&s2h)); + + uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1); + uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2); + + vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1; + } + } + + vdst.write(); +} // execute +// --- Inst_VOP3P__V_PK_MUL_F32 class methods --- + +Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_mul_f32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_MUL_F32 + +Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32() +{ +} // ~Inst_VOP3P__V_PK_MUL_F32 + +// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] * +// S1.f[31:0] +void +Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword1 = *reinterpret_cast<float*>(&lower_dword) + * *reinterpret_cast<float*>(&upper_dword); + + lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + upper_dword = (opsel_hi & 2) ?
bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword2 = *reinterpret_cast<float*>(&lower_dword) + * *reinterpret_cast<float*>(&upper_dword); + + uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1); + uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2); + + vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1; + } + } + + vdst.write(); +} // execute +// --- Inst_VOP3P__V_PK_ADD_F32 class methods --- + +Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_add_f32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_ADD_F32 + +Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32() +{ +} // ~Inst_VOP3P__V_PK_ADD_F32 + +// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] + +// S1.f[31:0] +void +Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword1 = *reinterpret_cast<float*>(&lower_dword) + + *reinterpret_cast<float*>(&upper_dword); + + lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword2 = *reinterpret_cast<float*>(&lower_dword) + + *reinterpret_cast<float*>(&upper_dword); + + uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1); + uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2); + + vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1; + } + } + + vdst.write(); +} // execute +// --- Inst_VOP3P__V_PK_MOV_B32 class methods --- + +Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_mov_b32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_MOV_B32 + +Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32() +{ +} // ~Inst_VOP3P__V_PK_MOV_B32 + +// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0]. +void +Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + // Only OPSEL[1:0] are used + // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0 + + int opsel = instData.OPSEL; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + // OPSEL[1] 0/1: Upper dest dword = lower/upper dword of src1 + uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint64_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + vdst[lane] = upper_dword << 32 | lower_dword; + } + } + + vdst.write(); +} // execute + } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p_mai.cc b/src/arch/amdgpu/vega/insts/vop3p_mai.cc new file mode 100644 index 0000000000..943aa72cfd --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3p_mai.cc @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "arch/amdgpu/vega/insts/vop3p.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 class methods --- + + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8(InFmt_VOP3P_MAI *iFmt) + : Inst_VOP3P_MAI(iFmt, "v_mfma_i32_16x16x16i8") + { + setFlag(ALU); + } // Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 + + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: + ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8() + { + } // ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 + + // D(16x16I32) = A(16x16I8) x B(16x16I8) + C(16x16I32), 1 Blocks, 8 + // pass, srcA/srcB 1 archVgpr, srcC/D 4 accVGPR + void + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst) + { + int acc_offset = 0; + if (instData.ACC_CD) { + warn("ACC_CD not yet implemented\n"); + } + + // int8 size allows for 4 elements per lane. At 16x16 this means 4 + // lanes per column (A matrix) / (B matrix). This whole matrix fits + // in one VGPR. The C matrix with size int32 requires 4 VGPRs. + // Handle the C matrix by using a delta. This is set to 1 normally to + // move to the next VGPR (1 dword away) and 0 if the input is a scalar + // reg (e.g., a constant). + int delta = isVectorReg(extData.SRC2) ? 1 : 0; + + // VecOperandI8 will read 8 bits and sign extend, so used U32 to read + // as "untyped" 32-bit values. 
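+ // As a concrete example of the packing used by the loops below: lane 0 of + // src0 carries A[0][0..3] in bytes 0..3 and lane 16 carries A[0][4..7], + // while lane 0 of src1 carries B[0][0], B[1][0], B[2][0] and B[3][0] in + // bytes 0..3. In the final loop, result row r is written to the + // (r mod 4)-th destination VGPR (vdsta through vdstd).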
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset); + ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); + ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); + ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + + VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset); + VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1); + VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2); + VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3); + + src0.readSrc(); + src1.readSrc(); + src2a.readSrc(); + src2b.readSrc(); + src2c.readSrc(); + src2d.readSrc(); + + int32_t A[16][16]; + for (int i = 0; i < 64; ++i) { + // src0[0:15] contains columns 1 - 4 packed for rows 0 - 15, + // src0[16:31] contains columns 5 - 8 packed for rows 0 - 15, + // src0[32:47] contains columns 9 - 12 packed for rows 0 - 15, + // src0[48:63] contains columns 13 - 16 packed for rows 0 - 15, + int row = i % 16; + int start_col = (i / 16) * 4; + + A[row][start_col+0] = sext<8>(bits(src0[i], 7, 0)); + A[row][start_col+1] = sext<8>(bits(src0[i], 15, 8)); + A[row][start_col+2] = sext<8>(bits(src0[i], 23, 16)); + A[row][start_col+3] = sext<8>(bits(src0[i], 31, 24)); + } + + int32_t B[16][16]; + for (int i = 0; i < 64; ++i) { + // src1[0:15] contains rows 1 - 4 packed for columns 0 - 15 + // src1[16:31] contains rows 5 - 8 packed for columns 0 - 15 + // src1[32:47] contains rows 9 - 12 packed for columns 0 - 15 + // src1[48:63] contains rows 13 - 16 packed for columns 0 - 15 + int start_row = (i / 16) * 4; + int col = i % 16; + + B[start_row+0][col] = sext<8>(bits(src1[i], 7, 0)); + B[start_row+1][col] = sext<8>(bits(src1[i], 15, 8)); + B[start_row+2][col] = sext<8>(bits(src1[i], 23, 16)); + B[start_row+3][col] = sext<8>(bits(src1[i], 31, 24)); + } + + int32_t result[16][16]; + + // Load accumulation matrix C into result + for (int i = 0; i < 64; ++i) { + // src2a contains rows 0, 4, 8, 12 + result[(i/16)*4][(i%16)] = src2a[i]; + // src2b contains rows 1, 5, 9, 13 + result[(i/16)*4+1][(i%16)] = src2b[i]; + // src2c contains rows 2, 6, 10, 14 + result[(i/16)*4+2][(i%16)] = src2c[i]; + // src2d contains rows 3, 7, 11, 15 + result[(i/16)*4+3][(i%16)] = src2d[i]; + } + + // Compute new result - This is (obviously) not optimized + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + for (int k = 0; k < 16; ++k) { + result[i][j] += A[i][k] * B[k][j]; + } + } + } + + // Put result in dest VGPRs + for (int i = 0; i < 64; ++i) { + // vdsta contains rows 0, 4, 8, 12 + vdsta[i] = result[(i/16)*4][(i%16)]; + // vdstb contains rows 1, 5, 9, 13 + vdstb[i] = result[(i/16)*4+1][(i%16)]; + // vdstc contains rows 2, 6, 10, 14 + vdstc[i] = result[(i/16)*4+2][(i%16)]; + // vdstd contains rows 3, 7, 11, 15 + vdstd[i] = result[(i/16)*4+3][(i%16)]; + } + + vdsta.write(); + vdstb.write(); + vdstc.write(); + vdstd.write(); + } // execute + // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods --- + + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI *iFmt) + : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64") + { + setFlag(ALU); + } // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 + + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: + ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64() + { + } // ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 + + // D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8 + // pass, srcA/srcB 
2 VGPR, srcC/D 8 VGPR + void + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst) + { + int acc_offset = 0; + if (instData.ACC_CD) { + warn("ACC_CD not yet implemented\n"); + } + + // Handling of src2 is a bit tricky. The operator[] overload cannot + // be used for dword count > 2, and the dword count here is 8. Usually + // src2 is a VGPR/AccGPR, but it might also be constant. In order to + // use operator[] and handle constants, check for VGPR here and set + // a delta for each of the pairs of src2 GPRs. + int delta = isVectorReg(extData.SRC2) ? 2 : 0; + + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset); + ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); + ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); + ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + + VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset); + VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2); + VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4); + VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6); + + src0.readSrc(); + src1.readSrc(); + src2a.readSrc(); + src2b.readSrc(); + src2c.readSrc(); + src2d.readSrc(); + + double result[16][16]; + + // Load src2 into result. src2 is row major + for (int i = 0; i < 64; ++i) { + // src2a contains rows 0 - 3 + result[(i/16)][(i%16)] = src2a[i]; + // src2b contains rows 4 - 7 + result[(i/16)+4][(i%16)] = src2b[i]; + // src2c contains rows 8 - 11 + result[(i/16)+8][(i%16)] = src2c[i]; + // src2d contains rows 12 - 15 + result[(i/16)+12][(i%16)] = src2d[i]; + } + + // Compute new result + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + for (int k = 0; k < 4; ++k) { + // src0 is column major, src1 is row major + int lane_A = 16*k + i; + int lane_B = 16*k + j; + result[i][j] += src0[lane_A] * src1[lane_B]; + } + } + } + + // Put result in dest VGPRs + for (int i = 0; i < 64; ++i) { + // vdsta contains rows 0 - 3 + vdsta[i] = result[(i/16)][(i%16)]; + // src2b contains rows 4 - 7 + vdstb[i] = result[(i/16)+4][(i%16)]; + // src2c contains rows 8 - 11 + vdstc[i] = result[(i/16)+8][(i%16)]; + // src2d contains rows 12 - 15 + vdstd[i] = result[(i/16)+12][(i%16)]; + } + + vdsta.write(); + vdstb.write(); + vdstc.write(); + vdstd.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vopc.cc b/src/arch/amdgpu/vega/insts/vopc.cc new file mode 100644 index 0000000000..2c386fec74 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vopc.cc @@ -0,0 +1,6590 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOPC__V_CMP_CLASS_F32 class methods --- + + Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_CLASS_F32 + + Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32() + { + } // ~Inst_VOPC__V_CMP_CLASS_F32 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
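+ // For example, S1.u = 0x003 (bits 0 and 1) reports true for any NaN input, + // while S1.u = 0x060 (bits 5 and 6) reports true only for +/-0.0. Each + // active lane tests its own S0 value against its own S1 mask; lanes whose + // value matches none of the selected classes are not written by the loop + // below.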
+ void + Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F32 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F32 + + Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F32 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.f The function reports true if the floating point value is *any* of + // the numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMP_CLASS_F64 class methods --- + + Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_CLASS_F64 + + Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() + { + } // ~Inst_VOPC__V_CMP_CLASS_F64 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.d + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+    void
+    Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
+                    // is NaN
+                    if (std::isnan(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 2)) {
+                    // is -infinity
+                    if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 3)) {
+                    // is -normal
+                    if (std::isnormal(src0[lane])
+                        && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 4)) {
+                    // is -denormal
+                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
+                        && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 5)) {
+                    // is -zero
+                    if (std::fpclassify(src0[lane]) == FP_ZERO
+                        && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 6)) {
+                    // is +zero
+                    if (std::fpclassify(src0[lane]) == FP_ZERO
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 7)) {
+                    // is +denormal
+                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 8)) {
+                    // is +normal
+                    if (std::isnormal(src0[lane])
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 9)) {
+                    // is +infinity
+                    if (std::isinf(src0[lane])
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_CLASS_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_class_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_CLASS_F64
+
+    Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_CLASS_F64
+
+    // --- description from .arch file ---
+    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
+    // S0.d The function reports true if the floating point value is *any* of
+    // the numeric types selected in S1.u according to the following list:
+    // S1.u[0] -- value is a signaling NaN.
+    // S1.u[1] -- value is a quiet NaN.
+    // S1.u[2] -- value is negative infinity.
+    // S1.u[3] -- value is a negative normal value.
+    // S1.u[4] -- value is a negative denormal value.
+    // S1.u[5] -- value is negative zero.
+    // S1.u[6] -- value is positive zero.
+    // S1.u[7] -- value is a positive denormal value.
+    // S1.u[8] -- value is a positive normal value.
+    // S1.u[9] -- value is positive infinity.
+    void
+    Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
+                    // is NaN
+                    if (std::isnan(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 2)) {
+                    // is -infinity
+                    if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 3)) {
+                    // is -normal
+                    if (std::isnormal(src0[lane])
+                        && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 4)) {
+                    // is -denormal
+                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
+                        && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 5)) {
+                    // is -zero
+                    if (std::fpclassify(src0[lane]) == FP_ZERO
+                        && std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 6)) {
+                    // is +zero
+                    if (std::fpclassify(src0[lane]) == FP_ZERO
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 7)) {
+                    // is +denormal
+                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 8)) {
+                    // is +normal
+                    if (std::isnormal(src0[lane])
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+                if (bits(src1[lane], 9)) {
+                    // is +infinity
+                    if (std::isinf(src0[lane])
+                        && !std::signbit(src0[lane])) {
+                        vcc.setBit(lane, 1);
+                        continue;
+                    }
+                }
+            }
+        }
+
+        vcc.write();
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMP_CLASS_F16 class methods ---
+
+    Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_class_f16")
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOPC__V_CMP_CLASS_F16
+
+    Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16()
+    {
+    } // ~Inst_VOPC__V_CMP_CLASS_F16
+
+    // --- description from .arch file ---
+    // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16
+    // The function reports true if the floating point value is *any* of the
+    // --- numeric types selected in S1.u according to the following list:
+    // S1.u[0] -- value is a signaling NaN.
+    // S1.u[1] -- value is a quiet NaN.
+    // S1.u[2] -- value is negative infinity.
+    // S1.u[3] -- value is a negative normal value.
+    // S1.u[4] -- value is a negative denormal value.
+    // S1.u[5] -- value is negative zero.
+    // S1.u[6] -- value is positive zero.
+    // S1.u[7] -- value is a positive denormal value.
+    // S1.u[8] -- value is a positive normal value.
+    // S1.u[9] -- value is positive infinity.
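+    // V_CMP_CLASS_F16 below, and the rest of the F16 compares that follow,
+    // are currently stubs that call panicUnimplemented(). A minimal sketch
+    // of what an implementation might look like is given here for reference
+    // only; it assumes a 16-bit float vector operand type (a hypothetical
+    // ConstVecOperandF16 that converts to a host float), which this change
+    // does not provide:
+    //
+    //     Wavefront *wf = gpuDynInst->wavefront();
+    //     ConstVecOperandF16 src0(gpuDynInst, instData.SRC0); // hypothetical
+    //     ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+    //     ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+    //
+    //     src0.readSrc();
+    //     src1.read();
+    //
+    //     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+    //         if (wf->execMask(lane)) {
+    //             float val = src0[lane];
+    //             // apply the same ten class-bit tests as the F32 version
+    //             // above to val, setting vcc.setBit(lane, 1) on a match
+    //         }
+    //     }
+    //
+    //     vcc.write();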
+ void + Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F16 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F16 + + Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F16 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // --- S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_F_F16 class methods --- + + Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_F_F16 + + Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() + { + } // ~Inst_VOPC__V_CMP_F_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LT_F16 class methods --- + + Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LT_F16 + + Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() + { + } // ~Inst_VOPC__V_CMP_LT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_F16 class methods --- + + Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_EQ_F16 + + Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() + { + } // ~Inst_VOPC__V_CMP_EQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LE_F16 class methods --- + + Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LE_F16 + + Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() + { + } // ~Inst_VOPC__V_CMP_LE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_GT_F16 class methods --- + + Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_GT_F16 + + Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() + { + } // ~Inst_VOPC__V_CMP_GT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LG_F16 class methods --- + + Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lg_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LG_F16 + + Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() + { + } // ~Inst_VOPC__V_CMP_LG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_GE_F16 class methods --- + + Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_GE_F16 + + Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() + { + } // ~Inst_VOPC__V_CMP_GE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_O_F16 class methods --- + + Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_o_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_O_F16 + + Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() + { + } // ~Inst_VOPC__V_CMP_O_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_U_F16 class methods --- + + Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_u_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_U_F16 + + Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() + { + } // ~Inst_VOPC__V_CMP_U_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NGE_F16 class methods --- + + Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nge_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NGE_F16 + + Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() + { + } // ~Inst_VOPC__V_CMP_NGE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLG_F16 class methods --- + + Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlg_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLG_F16 + + Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() + { + } // ~Inst_VOPC__V_CMP_NLG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NGT_F16 class methods --- + + Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ngt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NGT_F16 + + Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() + { + } // ~Inst_VOPC__V_CMP_NGT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLE_F16 class methods --- + + Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nle_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLE_F16 + + Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() + { + } // ~Inst_VOPC__V_CMP_NLE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NEQ_F16 class methods --- + + Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_neq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NEQ_F16 + + Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() + { + } // ~Inst_VOPC__V_CMP_NEQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLT_F16 class methods --- + + Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLT_F16 + + Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() + { + } // ~Inst_VOPC__V_CMP_NLT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_TRU_F16 class methods --- + + Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_tru_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_TRU_F16 + + Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() + { + } // ~Inst_VOPC__V_CMP_TRU_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_F_F16 class methods --- + + Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_F16 + + Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() + { + } // ~Inst_VOPC__V_CMPX_F_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_F16 class methods --- + + Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_F16 + + Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() + { + } // ~Inst_VOPC__V_CMPX_LT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_F16 class methods --- + + Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_F16 + + Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() + { + } // ~Inst_VOPC__V_CMPX_EQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_F16 class methods --- + + Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_F16 + + Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() + { + } // ~Inst_VOPC__V_CMPX_LE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_F16 class methods --- + + Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_F16 + + Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() + { + } // ~Inst_VOPC__V_CMPX_GT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LG_F16 class methods --- + + Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lg_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LG_F16 + + Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() + { + } // ~Inst_VOPC__V_CMPX_LG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_F16 class methods --- + + Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_F16 + + Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() + { + } // ~Inst_VOPC__V_CMPX_GE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_O_F16 class methods --- + + Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_o_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_O_F16 + + Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() + { + } // ~Inst_VOPC__V_CMPX_O_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_U_F16 class methods --- + + Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_u_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_U_F16 + + Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() + { + } // ~Inst_VOPC__V_CMPX_U_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NGE_F16 class methods --- + + Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nge_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGE_F16 + + Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16() + { + } // ~Inst_VOPC__V_CMPX_NGE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NLG_F16 class methods --- + + Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlg_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLG_F16 + + Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16() + { + } // ~Inst_VOPC__V_CMPX_NLG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NGT_F16 class methods --- + + Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ngt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGT_F16 + + Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16() + { + } // ~Inst_VOPC__V_CMPX_NGT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NLE_F16 class methods --- + + Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nle_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLE_F16 + + Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16() + { + } // ~Inst_VOPC__V_CMPX_NLE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NEQ_F16 class methods --- + + Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_neq_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NEQ_F16 + + Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16() + { + } // ~Inst_VOPC__V_CMPX_NEQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NLT_F16 class methods --- + + Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLT_F16 + + Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16() + { + } // ~Inst_VOPC__V_CMPX_NLT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_TRU_F16 class methods --- + + Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_tru_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_TRU_F16 + + Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16() + { + } // ~Inst_VOPC__V_CMPX_TRU_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_F_F32 class methods --- + + Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_F_F32 + + Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32() + { + } // ~Inst_VOPC__V_CMP_F_F32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
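+    // The implemented F32 (and later F64) compares below all share one
+    // pattern: read both sources, evaluate the predicate for every active
+    // lane, record the result in the matching VCC bit, and commit with
+    // vcc.write(); the CMPX forms additionally copy the mask into EXEC.
+    // V_CMP_F and V_CMP_TRU are the degenerate cases that ignore their
+    // sources and write a constant 0 or 1 for each active lane. As an
+    // illustration only (not part of this change), the boilerplate could be
+    // factored into a helper along these lines:
+    //
+    //     template<typename VecOp, typename Pred>
+    //     static void
+    //     vopcCompare(GPUDynInstPtr gpuDynInst, int srcIdx0, int srcIdx1,
+    //                 Pred pred)
+    //     {
+    //         Wavefront *wf = gpuDynInst->wavefront();
+    //         VecOp src0(gpuDynInst, srcIdx0);
+    //         VecOp src1(gpuDynInst, srcIdx1);
+    //         ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+    //
+    //         src0.readSrc();
+    //         src1.read();
+    //
+    //         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+    //             if (wf->execMask(lane)) {
+    //                 vcc.setBit(lane, pred(src0[lane], src1[lane]) ? 1 : 0);
+    //             }
+    //         }
+    //
+    //         vcc.write();
+    //     }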
+ void + Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_F32 class methods --- + + Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_LT_F32 + + Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32() + { + } // ~Inst_VOPC__V_CMP_LT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_F32 class methods --- + + Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_EQ_F32 + + Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32() + { + } // ~Inst_VOPC__V_CMP_EQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_F32 class methods --- + + Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_LE_F32 + + Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32() + { + } // ~Inst_VOPC__V_CMP_LE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_F32 class methods --- + + Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_GT_F32 + + Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32() + { + } // ~Inst_VOPC__V_CMP_GT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LG_F32 class methods --- + + Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lg_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_LG_F32 + + Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32() + { + } // ~Inst_VOPC__V_CMP_LG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_F32 class methods --- + + Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_GE_F32 + + Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32() + { + } // ~Inst_VOPC__V_CMP_GE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_O_F32 class methods --- + + Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_o_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_O_F32 + + Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32() + { + } // ~Inst_VOPC__V_CMP_O_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_U_F32 class methods --- + + Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_u_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_U_F32 + + Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32() + { + } // ~Inst_VOPC__V_CMP_U_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGE_F32 class methods --- + + Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nge_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NGE_F32 + + Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32() + { + } // ~Inst_VOPC__V_CMP_NGE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLG_F32 class methods --- + + Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlg_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NLG_F32 + + Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32() + { + } // ~Inst_VOPC__V_CMP_NLG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGT_F32 class methods --- + + Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ngt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NGT_F32 + + Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32() + { + } // ~Inst_VOPC__V_CMP_NGT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
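+    // Note on the negated compares (NGE, NLG, NGT, NLE, NEQ, NLT): they are
+    // implemented as the logical negation of the ordered predicate rather
+    // than as the complementary ordered compare. With IEEE floats,
+    // !(a > b) is true when either operand is NaN while (a <= b) is false,
+    // so e.g. v_cmp_ngt_f32 is not interchangeable with v_cmp_le_f32 once
+    // NaNs are involved.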
+ void + Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLE_F32 class methods --- + + Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nle_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NLE_F32 + + Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32() + { + } // ~Inst_VOPC__V_CMP_NLE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NEQ_F32 class methods --- + + Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_neq_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NEQ_F32 + + Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32() + { + } // ~Inst_VOPC__V_CMP_NEQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLT_F32 class methods --- + + Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NLT_F32 + + Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32() + { + } // ~Inst_VOPC__V_CMP_NLT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_TRU_F32 class methods --- + + Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_tru_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_TRU_F32 + + Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32() + { + } // ~Inst_VOPC__V_CMP_TRU_F32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_F32 class methods --- + + Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_F32 + + Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32() + { + } // ~Inst_VOPC__V_CMPX_F_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_F32 class methods --- + + Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_F32 + + Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32() + { + } // ~Inst_VOPC__V_CMPX_LT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_F32 class methods --- + + Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_F32 + + Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32() + { + } // ~Inst_VOPC__V_CMPX_EQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_F32 class methods --- + + Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_F32 + + Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32() + { + } // ~Inst_VOPC__V_CMPX_LE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_F32 class methods --- + + Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_F32 + + Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32() + { + } // ~Inst_VOPC__V_CMPX_GT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LG_F32 class methods --- + + Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lg_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LG_F32 + + Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32() + { + } // ~Inst_VOPC__V_CMPX_LG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_F32 class methods --- + + Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_F32 + + Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32() + { + } // ~Inst_VOPC__V_CMPX_GE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_O_F32 class methods --- + + Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_o_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_O_F32 + + Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32() + { + } // ~Inst_VOPC__V_CMPX_O_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_U_F32 class methods --- + + Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_u_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_U_F32 + + Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32() + { + } // ~Inst_VOPC__V_CMPX_U_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NGE_F32 class methods --- + + Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nge_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGE_F32 + + Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32() + { + } // ~Inst_VOPC__V_CMPX_NGE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NLG_F32 class methods --- + + Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlg_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLG_F32 + + Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32() + { + } // ~Inst_VOPC__V_CMPX_NLG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NGT_F32 class methods --- + + Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ngt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGT_F32 + + Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32() + { + } // ~Inst_VOPC__V_CMPX_NGT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NLE_F32 class methods --- + + Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nle_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLE_F32 + + Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32() + { + } // ~Inst_VOPC__V_CMPX_NLE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NEQ_F32 class methods --- + + Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_neq_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NEQ_F32 + + Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32() + { + } // ~Inst_VOPC__V_CMPX_NEQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NLT_F32 class methods --- + + Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLT_F32 + + Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32() + { + } // ~Inst_VOPC__V_CMPX_NLT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_TRU_F32 class methods --- + + Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_tru_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_TRU_F32 + + Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32() + { + } // ~Inst_VOPC__V_CMPX_TRU_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMP_F_F64 class methods --- + + Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_F_F64 + + Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64() + { + } // ~Inst_VOPC__V_CMP_F_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
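+    // The F64 compares below mirror the F32 ones above, using 64-bit vector
+    // source operands; V_CMP_F_F64 and V_CMP_TRU_F64 again ignore their
+    // sources and write a constant 0 / 1 bit per active lane.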
+ void + Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_F64 class methods --- + + Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_LT_F64 + + Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64() + { + } // ~Inst_VOPC__V_CMP_LT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_F64 class methods --- + + Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_EQ_F64 + + Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64() + { + } // ~Inst_VOPC__V_CMP_EQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_F64 class methods --- + + Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_LE_F64 + + Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64() + { + } // ~Inst_VOPC__V_CMP_LE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_F64 class methods --- + + Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_GT_F64 + + Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64() + { + } // ~Inst_VOPC__V_CMP_GT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LG_F64 class methods --- + + Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lg_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_LG_F64 + + Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64() + { + } // ~Inst_VOPC__V_CMP_LG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_F64 class methods --- + + Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_GE_F64 + + Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64() + { + } // ~Inst_VOPC__V_CMP_GE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_O_F64 class methods --- + + Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_o_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_O_F64 + + Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64() + { + } // ~Inst_VOPC__V_CMP_O_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_U_F64 class methods --- + + Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_u_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_U_F64 + + Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64() + { + } // ~Inst_VOPC__V_CMP_U_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGE_F64 class methods --- + + Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nge_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NGE_F64 + + Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64() + { + } // ~Inst_VOPC__V_CMP_NGE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLG_F64 class methods --- + + Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlg_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NLG_F64 + + Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64() + { + } // ~Inst_VOPC__V_CMP_NLG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGT_F64 class methods --- + + Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ngt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NGT_F64 + + Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64() + { + } // ~Inst_VOPC__V_CMP_NGT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
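// NaN handling is what separates the plain predicates from the "N"-prefixed
// ones. The ordered compares (lt, eq, le, gt, ge, lg) rely on native C++
// floating-point comparisons, which are false whenever either input is NaN,
// and v_cmp_o/v_cmp_u test orderedness explicitly with std::isnan. The
// negated forms (nge, nlg, ngt, nle, neq, nlt) therefore come out true for
// NaN inputs. For example, with src0[lane] = NaN:
//
//     src0[lane] > src1[lane]      // false -> v_cmp_gt_f64 writes 0
//     !(src0[lane] > src1[lane])   // true  -> v_cmp_ngt_f64 writes 1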
+ void + Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLE_F64 class methods --- + + Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nle_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NLE_F64 + + Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64() + { + } // ~Inst_VOPC__V_CMP_NLE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NEQ_F64 class methods --- + + Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_neq_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NEQ_F64 + + Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64() + { + } // ~Inst_VOPC__V_CMP_NEQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLT_F64 class methods --- + + Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NLT_F64 + + Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64() + { + } // ~Inst_VOPC__V_CMP_NLT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_TRU_F64 class methods --- + + Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_tru_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_TRU_F64 + + Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64() + { + } // ~Inst_VOPC__V_CMP_TRU_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_F64 class methods --- + + Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_F64 + + Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64() + { + } // ~Inst_VOPC__V_CMPX_F_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_F64 class methods --- + + Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_F64 + + Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64() + { + } // ~Inst_VOPC__V_CMPX_LT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_F64 class methods --- + + Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_F64 + + Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64() + { + } // ~Inst_VOPC__V_CMPX_EQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_F64 class methods --- + + Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_F64 + + Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64() + { + } // ~Inst_VOPC__V_CMPX_LE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_F64 class methods --- + + Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_F64 + + Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64() + { + } // ~Inst_VOPC__V_CMPX_GT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LG_F64 class methods --- + + Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lg_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LG_F64 + + Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64() + { + } // ~Inst_VOPC__V_CMPX_LG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_F64 class methods --- + + Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_F64 + + Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64() + { + } // ~Inst_VOPC__V_CMPX_GE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
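// The CMPX variants differ from their CMP counterparts only in that the
// staged lane mask is also copied into the wavefront's EXEC mask via
// wf->execMask() = vcc.rawData(), so subsequent vector instructions execute
// only in lanes where the comparison held. Since the copy reads the locally
// staged value, placing it before or after vcc.write() makes no difference,
// and both orderings appear in this file.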
+ void + Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_O_F64 class methods --- + + Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_o_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_O_F64 + + Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64() + { + } // ~Inst_VOPC__V_CMPX_O_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_U_F64 class methods --- + + Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_u_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_U_F64 + + Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64() + { + } // ~Inst_VOPC__V_CMPX_U_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NGE_F64 class methods --- + + Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nge_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGE_F64 + + Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64() + { + } // ~Inst_VOPC__V_CMPX_NGE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NLG_F64 class methods --- + + Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlg_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLG_F64 + + Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64() + { + } // ~Inst_VOPC__V_CMPX_NLG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NGT_F64 class methods --- + + Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ngt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGT_F64 + + Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64() + { + } // ~Inst_VOPC__V_CMPX_NGT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NLE_F64 class methods --- + + Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nle_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLE_F64 + + Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64() + { + } // ~Inst_VOPC__V_CMPX_NLE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NEQ_F64 class methods --- + + Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_neq_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NEQ_F64 + + Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64() + { + } // ~Inst_VOPC__V_CMPX_NEQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NLT_F64 class methods --- + + Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLT_F64 + + Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64() + { + } // ~Inst_VOPC__V_CMPX_NLT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_TRU_F64 class methods --- + + Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_tru_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_TRU_F64 + + Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64() + { + } // ~Inst_VOPC__V_CMPX_TRU_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I16 class methods --- + + Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I16 + + Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16() + { + } // ~Inst_VOPC__V_CMP_F_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
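// The integer compares below reuse the same lane-mask pattern; only the
// element type of the vector operands changes. ConstVecOperandI16 gives
// signed 16-bit comparisons and ConstVecOperandU16 unsigned ones, so for a
// lane whose raw bits are 0xffff compared against 0:
//
//     signed view   (-1)    -> v_cmp_lt_i16 writes 1
//     unsigned view (65535) -> v_cmp_lt_u16 writes 0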
+ void + Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I16 class methods --- + + Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I16 + + Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16() + { + } // ~Inst_VOPC__V_CMP_LT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I16 class methods --- + + Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I16 + + Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16() + { + } // ~Inst_VOPC__V_CMP_EQ_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I16 class methods --- + + Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I16 + + Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16() + { + } // ~Inst_VOPC__V_CMP_LE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I16 class methods --- + + Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I16 + + Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16() + { + } // ~Inst_VOPC__V_CMP_GT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I16 class methods --- + + Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I16 + + Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16() + { + } // ~Inst_VOPC__V_CMP_NE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I16 class methods --- + + Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I16 + + Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16() + { + } // ~Inst_VOPC__V_CMP_GE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I16 class methods --- + + Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I16 + + Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16() + { + } // ~Inst_VOPC__V_CMP_T_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U16 class methods --- + + Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U16 + + Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() + { + } // ~Inst_VOPC__V_CMP_F_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U16 class methods --- + + Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U16 + + Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() + { + } // ~Inst_VOPC__V_CMP_LT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U16 class methods --- + + Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U16 + + Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() + { + } // ~Inst_VOPC__V_CMP_EQ_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U16 class methods --- + + Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U16 + + Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() + { + } // ~Inst_VOPC__V_CMP_LE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U16 class methods --- + + Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U16 + + Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() + { + } // ~Inst_VOPC__V_CMP_GT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U16 class methods --- + + Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U16 + + Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() + { + } // ~Inst_VOPC__V_CMP_NE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U16 class methods --- + + Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U16 + + Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() + { + } // ~Inst_VOPC__V_CMP_GE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U16 class methods --- + + Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U16 + + Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() + { + } // ~Inst_VOPC__V_CMP_T_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I16 class methods --- + + Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I16 + + Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() + { + } // ~Inst_VOPC__V_CMPX_F_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
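// The CMPX forms of the integer compares update EXEC exactly as the
// floating-point ones do. In the degenerate v_cmpx_f_* case below, every
// active lane's bit is cleared before the mask is copied into EXEC, which
// switches off all currently active lanes in the wavefront.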
+ void + Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I16 class methods --- + + Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I16 + + Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() + { + } // ~Inst_VOPC__V_CMPX_LT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I16 class methods --- + + Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I16 + + Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() + { + } // ~Inst_VOPC__V_CMPX_EQ_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I16 class methods --- + + Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I16 + + Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() + { + } // ~Inst_VOPC__V_CMPX_LE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I16 class methods --- + + Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I16 + + Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() + { + } // ~Inst_VOPC__V_CMPX_GT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I16 class methods --- + + Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I16 + + Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() + { + } // ~Inst_VOPC__V_CMPX_NE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I16 class methods --- + + Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I16 + + Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() + { + } // ~Inst_VOPC__V_CMPX_GE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I16 class methods --- + + Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I16 + + Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() + { + } // ~Inst_VOPC__V_CMPX_T_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U16 class methods --- + + Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U16 + + Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() + { + } // ~Inst_VOPC__V_CMPX_F_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U16 class methods --- + + Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U16 + + Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() + { + } // ~Inst_VOPC__V_CMPX_LT_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U16 class methods --- + + Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U16 + + Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() + { + } // ~Inst_VOPC__V_CMPX_EQ_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U16 class methods --- + + Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U16 + + Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() + { + } // ~Inst_VOPC__V_CMPX_LE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U16 class methods --- + + Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U16 + + Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() + { + } // ~Inst_VOPC__V_CMPX_GT_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U16 class methods --- + + Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U16 + + Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() + { + } // ~Inst_VOPC__V_CMPX_NE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U16 class methods --- + + Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U16 + + Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() + { + } // ~Inst_VOPC__V_CMPX_GE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U16 class methods --- + + Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U16 + + Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() + { + } // ~Inst_VOPC__V_CMPX_T_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I32 class methods --- + + Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I32 + + Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() + { + } // ~Inst_VOPC__V_CMP_F_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I32 class methods --- + + Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I32 + + Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() + { + } // ~Inst_VOPC__V_CMP_LT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I32 class methods --- + + Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I32 + + Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() + { + } // ~Inst_VOPC__V_CMP_EQ_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I32 class methods --- + + Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I32 + + Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() + { + } // ~Inst_VOPC__V_CMP_LE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I32 class methods --- + + Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I32 + + Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() + { + } // ~Inst_VOPC__V_CMP_GT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I32 class methods --- + + Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I32 + + Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() + { + } // ~Inst_VOPC__V_CMP_NE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I32 class methods --- + + Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I32 + + Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() + { + } // ~Inst_VOPC__V_CMP_GE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
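// The 32-bit integer compares complete the pattern with ConstVecOperandI32
// and ConstVecOperandU32 element types. In compiler-generated code the VCC
// mask these plain CMP opcodes produce is typically consumed by a conditional
// select (v_cndmask_b32) or by scalar EXEC manipulation such as
// s_and_saveexec_b64 when lowering divergent branches; EXEC itself is left
// untouched here, unlike in the CMPX forms.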
+ void + Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I32 class methods --- + + Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I32 + + Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() + { + } // ~Inst_VOPC__V_CMP_T_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U32 class methods --- + + Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U32 + + Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() + { + } // ~Inst_VOPC__V_CMP_F_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U32 class methods --- + + Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U32 + + Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() + { + } // ~Inst_VOPC__V_CMP_LT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U32 class methods --- + + Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U32 + + Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() + { + } // ~Inst_VOPC__V_CMP_EQ_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
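+ // Note: the unsigned 32-bit compares in this group mirror the signed
+ // variants above; only the operand element type changes
+ // (ConstVecOperandU32 instead of ConstVecOperandI32), so the per-lane
+ // comparison is performed on unsigned values.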
+ void + Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U32 class methods --- + + Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U32 + + Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() + { + } // ~Inst_VOPC__V_CMP_LE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U32 class methods --- + + Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U32 + + Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() + { + } // ~Inst_VOPC__V_CMP_GT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U32 class methods --- + + Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U32 + + Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() + { + } // ~Inst_VOPC__V_CMP_NE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U32 class methods --- + + Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U32 + + Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() + { + } // ~Inst_VOPC__V_CMP_GE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U32 class methods --- + + Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U32 + + Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() + { + } // ~Inst_VOPC__V_CMP_T_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I32 class methods --- + + Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I32 + + Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() + { + } // ~Inst_VOPC__V_CMPX_F_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I32 class methods --- + + Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I32 + + Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() + { + } // ~Inst_VOPC__V_CMPX_LT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I32 class methods --- + + Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I32 + + Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() + { + } // ~Inst_VOPC__V_CMPX_EQ_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I32 class methods --- + + Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I32 + + Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() + { + } // ~Inst_VOPC__V_CMPX_LE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I32 class methods --- + + Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I32 + + Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() + { + } // ~Inst_VOPC__V_CMPX_GT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I32 class methods --- + + Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I32 + + Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() + { + } // ~Inst_VOPC__V_CMPX_NE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
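+ // Note: "<>" in the .arch pseudocode denotes "not equal"; it is
+ // implemented below with the C++ != operator.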
+ void + Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I32 class methods --- + + Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I32 + + Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() + { + } // ~Inst_VOPC__V_CMPX_GE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I32 class methods --- + + Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I32 + + Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() + { + } // ~Inst_VOPC__V_CMPX_T_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U32 class methods --- + + Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U32 + + Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() + { + } // ~Inst_VOPC__V_CMPX_F_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U32 class methods --- + + Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U32 + + Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() + { + } // ~Inst_VOPC__V_CMPX_LT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
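+ // Note: as with every CMPX variant here, the per-lane result mask is
+ // committed to EXEC (wf->execMask() = vcc.rawData()) in addition to
+ // VCC, so lanes whose comparison fails are disabled for subsequent
+ // instructions.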
+ void + Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U32 class methods --- + + Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U32 + + Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() + { + } // ~Inst_VOPC__V_CMPX_EQ_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U32 class methods --- + + Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U32 + + Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() + { + } // ~Inst_VOPC__V_CMPX_LE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U32 class methods --- + + Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U32 + + Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() + { + } // ~Inst_VOPC__V_CMPX_GT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U32 class methods --- + + Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U32 + + Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() + { + } // ~Inst_VOPC__V_CMPX_NE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U32 class methods --- + + Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U32 + + Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() + { + } // ~Inst_VOPC__V_CMPX_GE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U32 class methods --- + + Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U32 + + Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() + { + } // ~Inst_VOPC__V_CMPX_T_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I64 class methods --- + + Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I64 + + Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() + { + } // ~Inst_VOPC__V_CMP_F_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I64 class methods --- + + Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I64 + + Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() + { + } // ~Inst_VOPC__V_CMP_LT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I64 class methods --- + + Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I64 + + Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() + { + } // ~Inst_VOPC__V_CMP_EQ_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I64 class methods --- + + Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I64 + + Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() + { + } // ~Inst_VOPC__V_CMP_LE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I64 class methods --- + + Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I64 + + Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() + { + } // ~Inst_VOPC__V_CMP_GT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
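+ // Note: the 64-bit compares read ConstVecOperandI64/U64 sources but,
+ // like the 32-bit forms, still produce a single result bit per lane
+ // in the 64-bit VCC mask.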
+ void + Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I64 class methods --- + + Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I64 + + Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() + { + } // ~Inst_VOPC__V_CMP_NE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I64 class methods --- + + Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I64 + + Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() + { + } // ~Inst_VOPC__V_CMP_GE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I64 class methods --- + + Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I64 + + Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() + { + } // ~Inst_VOPC__V_CMP_T_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U64 class methods --- + + Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U64 + + Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() + { + } // ~Inst_VOPC__V_CMP_F_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U64 class methods --- + + Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U64 + + Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() + { + } // ~Inst_VOPC__V_CMP_LT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U64 class methods --- + + Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U64 + + Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() + { + } // ~Inst_VOPC__V_CMP_EQ_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U64 class methods --- + + Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U64 + + Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() + { + } // ~Inst_VOPC__V_CMP_LE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U64 class methods --- + + Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U64 + + Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() + { + } // ~Inst_VOPC__V_CMP_GT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U64 class methods --- + + Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U64 + + Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() + { + } // ~Inst_VOPC__V_CMP_NE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U64 class methods --- + + Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U64 + + Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() + { + } // ~Inst_VOPC__V_CMP_GE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U64 class methods --- + + Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U64 + + Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() + { + } // ~Inst_VOPC__V_CMP_T_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I64 class methods --- + + Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I64 + + Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() + { + } // ~Inst_VOPC__V_CMPX_F_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I64 class methods --- + + Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I64 + + Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() + { + } // ~Inst_VOPC__V_CMPX_LT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I64 class methods --- + + Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I64 + + Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() + { + } // ~Inst_VOPC__V_CMPX_EQ_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I64 class methods --- + + Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I64 + + Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() + { + } // ~Inst_VOPC__V_CMPX_LE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I64 class methods --- + + Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I64 + + Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() + { + } // ~Inst_VOPC__V_CMPX_GT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I64 class methods --- + + Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I64 + + Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() + { + } // ~Inst_VOPC__V_CMPX_NE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I64 class methods --- + + Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I64 + + Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() + { + } // ~Inst_VOPC__V_CMPX_GE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I64 class methods --- + + Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I64 + + Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() + { + } // ~Inst_VOPC__V_CMPX_T_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U64 class methods --- + + Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U64 + + Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() + { + } // ~Inst_VOPC__V_CMPX_F_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U64 class methods --- + + Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U64 + + Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() + { + } // ~Inst_VOPC__V_CMPX_LT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U64 class methods --- + + Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U64 + + Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() + { + } // ~Inst_VOPC__V_CMPX_EQ_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U64 class methods --- + + Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U64 + + Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() + { + } // ~Inst_VOPC__V_CMPX_LE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U64 class methods --- + + Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U64 + + Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() + { + } // ~Inst_VOPC__V_CMPX_GT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U64 class methods --- + + Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U64 + + Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() + { + } // ~Inst_VOPC__V_CMPX_NE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U64 class methods --- + + Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U64 + + Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() + { + } // ~Inst_VOPC__V_CMPX_GE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U64 class methods --- + + Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U64 + + Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() + { + } // ~Inst_VOPC__V_CMPX_T_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute +} // namespace VegaISA +} // namespace gem5
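
The VOPC implementations above all follow one pattern: read the two source operands, evaluate the comparison for every lane enabled in the wavefront's EXEC mask, pack the per-lane results into the 64-bit VCC register, and, for the CMPX variants, copy that mask back into EXEC. The standalone sketch below illustrates that pattern outside of gem5; the lane count, the plain arrays, and the function name are hypothetical stand-ins for gem5's Wavefront and operand classes, not part of this patch.

#include <cstdint>
#include <cstdio>

constexpr int NUM_LANES = 64;   // wavefront width assumed for this sketch

// Build a VCC-style bitmask: bit N holds the compare result for lane N.
// Only lanes enabled in execMask are evaluated, matching the
// wf->execMask(lane) guard used in the instructions above.
uint64_t
cmpLtU32(const uint32_t *src0, const uint32_t *src1, uint64_t execMask)
{
    uint64_t vcc = 0;
    for (int lane = 0; lane < NUM_LANES; ++lane) {
        if (execMask & (1ULL << lane)) {
            vcc |= (src0[lane] < src1[lane] ? 1ULL : 0ULL) << lane;
        }
    }
    return vcc;
}

int main()
{
    uint32_t a[NUM_LANES] = {1, 5, 3};
    uint32_t b[NUM_LANES] = {2, 4, 3};
    uint64_t exec = 0x7;          // only lanes 0..2 active
    uint64_t vcc = cmpLtU32(a, b, exec);

    // A CMPX-style compare would additionally commit the mask to EXEC,
    // turning off every active lane whose comparison evaluated false.
    uint64_t execAfterCmpx = vcc;

    std::printf("vcc=0x%llx exec(after cmpx)=0x%llx\n",
                (unsigned long long)vcc,
                (unsigned long long)execAfterCmpx);
    return 0;
}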