From cd91c6321ff82f037149e55ab73dbeb0ada98180 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 19 Jan 2024 13:02:20 -0600 Subject: [PATCH 1/2] arch-vega: Reorganize instructions to multiple files The Vega instructions.cc file is 47k lines long which results in both large compilation times whenever it is modified and long style check times. This makes iterating over more complex instruction implementations very time consuming. This commit moves the instruction definitions to multiple files based on the instruction encoding (SOP2, VOP2, FLAT, DS, etc.). The resulting files are much smaller (max is 8k lines) and compilation and style check times are much more reasonable. Other than moving code around, there are no functional changes in this commit. Change-Id: Id4ac8e98ef11a58de5fd328f8a0cd7ce60a11819 --- src/arch/amdgpu/vega/SConscript | 19 + src/arch/amdgpu/vega/insts/ds.cc | 4657 ++ src/arch/amdgpu/vega/insts/exp.cc | 58 + src/arch/amdgpu/vega/insts/flat.cc | 2138 + src/arch/amdgpu/vega/insts/inst_util.hh | 4 +- src/arch/amdgpu/vega/insts/instructions.cc | 46539 ------------------- src/arch/amdgpu/vega/insts/mimg.cc | 2047 + src/arch/amdgpu/vega/insts/mtbuf.cc | 584 + src/arch/amdgpu/vega/insts/mubuf.cc | 2789 ++ src/arch/amdgpu/vega/insts/smem.cc | 1013 + src/arch/amdgpu/vega/insts/sop1.cc | 1504 + src/arch/amdgpu/vega/insts/sop2.cc | 1555 + src/arch/amdgpu/vega/insts/sopc.cc | 599 + src/arch/amdgpu/vega/insts/sopk.cc | 648 + src/arch/amdgpu/vega/insts/sopp.cc | 900 + src/arch/amdgpu/vega/insts/vinterp.cc | 115 + src/arch/amdgpu/vega/insts/vop1.cc | 2340 + src/arch/amdgpu/vega/insts/vop2.cc | 2187 + src/arch/amdgpu/vega/insts/vop3.cc | 8906 ++++ src/arch/amdgpu/vega/insts/vop3_cmp.cc | 8145 ++++ src/arch/amdgpu/vega/insts/vop3p.cc | 232 + src/arch/amdgpu/vega/insts/vop3p_mai.cc | 257 + src/arch/amdgpu/vega/insts/vopc.cc | 6590 +++ 23 files changed, 47286 insertions(+), 46540 deletions(-) create mode 100644 src/arch/amdgpu/vega/insts/ds.cc create mode 
100644 src/arch/amdgpu/vega/insts/exp.cc create mode 100644 src/arch/amdgpu/vega/insts/flat.cc delete mode 100644 src/arch/amdgpu/vega/insts/instructions.cc create mode 100644 src/arch/amdgpu/vega/insts/mimg.cc create mode 100644 src/arch/amdgpu/vega/insts/mtbuf.cc create mode 100644 src/arch/amdgpu/vega/insts/mubuf.cc create mode 100644 src/arch/amdgpu/vega/insts/smem.cc create mode 100644 src/arch/amdgpu/vega/insts/sop1.cc create mode 100644 src/arch/amdgpu/vega/insts/sop2.cc create mode 100644 src/arch/amdgpu/vega/insts/sopc.cc create mode 100644 src/arch/amdgpu/vega/insts/sopk.cc create mode 100644 src/arch/amdgpu/vega/insts/sopp.cc create mode 100644 src/arch/amdgpu/vega/insts/vinterp.cc create mode 100644 src/arch/amdgpu/vega/insts/vop1.cc create mode 100644 src/arch/amdgpu/vega/insts/vop2.cc create mode 100644 src/arch/amdgpu/vega/insts/vop3.cc create mode 100644 src/arch/amdgpu/vega/insts/vop3_cmp.cc create mode 100644 src/arch/amdgpu/vega/insts/vop3p_mai.cc create mode 100644 src/arch/amdgpu/vega/insts/vopc.cc diff --git a/src/arch/amdgpu/vega/SConscript b/src/arch/amdgpu/vega/SConscript index 019ef279b3..912c02cfdc 100644 --- a/src/arch/amdgpu/vega/SConscript +++ b/src/arch/amdgpu/vega/SConscript @@ -56,6 +56,25 @@ if env['CONF']['TARGET_GPU_ISA'] == 'vega': Source('isa.cc') Source('registers.cc') + Source('insts/sop2.cc') + Source('insts/sopk.cc') + Source('insts/sop1.cc') + Source('insts/sopc.cc') + Source('insts/sopp.cc') + Source('insts/smem.cc') + Source('insts/vop2.cc') + Source('insts/vop1.cc') + Source('insts/vopc.cc') + Source('insts/vinterp.cc') + Source('insts/vop3.cc') + Source('insts/vop3_cmp.cc') + Source('insts/ds.cc') + Source('insts/mubuf.cc') + Source('insts/mtbuf.cc') + Source('insts/mimg.cc') + Source('insts/exp.cc') + Source('insts/flat.cc') Source('insts/vop3p.cc') + Source('insts/vop3p_mai.cc') DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA') diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc new file 
mode 100644 index 0000000000..17acdaa287 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/ds.cc @@ -0,0 +1,4657 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_DS__DS_ADD_U32 class methods --- + + Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_u32") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_U32 + + Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32() + { + } // ~Inst_DS__DS_ADD_U32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] += DATA; + void + Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_ADD_U32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_ADD_U32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_SUB_U32 class methods --- + + Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_u32") + { + } // Inst_DS__DS_SUB_U32 + + Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32() + { + } // ~Inst_DS__DS_SUB_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + 
// MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_U32 class methods --- + + Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_u32") + { + } // Inst_DS__DS_RSUB_U32 + + Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32() + { + } // ~Inst_DS__DS_RSUB_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. + void + Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_U32 class methods --- + + Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_u32") + { + } // Inst_DS__DS_INC_U32 + + Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32() + { + } // ~Inst_DS__DS_INC_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_U32 class methods --- + + Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_u32") + { + } // Inst_DS__DS_DEC_U32 + + Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32() + { + } // ~Inst_DS__DS_DEC_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. 
+ void + Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_I32 class methods --- + + Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_i32") + { + } // Inst_DS__DS_MIN_I32 + + Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32() + { + } // ~Inst_DS__DS_MIN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_I32 class methods --- + + Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_i32") + { + } // Inst_DS__DS_MAX_I32 + + Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32() + { + } // ~Inst_DS__DS_MAX_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_U32 class methods --- + + Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_u32") + { + } // Inst_DS__DS_MIN_U32 + + Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32() + { + } // ~Inst_DS__DS_MIN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_U32 class methods --- + + Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_u32") + { + } // Inst_DS__DS_MAX_U32 + + Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32() + { + } // ~Inst_DS__DS_MAX_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_B32 class methods --- + + Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_b32") + { + } // Inst_DS__DS_AND_B32 + + Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32() + { + } // ~Inst_DS__DS_AND_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_B32 class methods --- + + Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_b32") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicOr); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_OR_B32 + + Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32() + { + } // ~Inst_DS__DS_OR_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] |= DATA; + void + Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, 
extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + + // --- Inst_DS__DS_XOR_B32 class methods --- + + Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_b32") + { + } // Inst_DS__DS_XOR_B32 + + Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32() + { + } // ~Inst_DS__DS_XOR_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_B32 class methods --- + + Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_b32") + { + } // Inst_DS__DS_MSKOR_B32 + + Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32() + { + } // ~Inst_DS__DS_MSKOR_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. 
+ void + Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_B32 class methods --- + + Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B32 + + Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32() + { + } // ~Inst_DS__DS_WRITE_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] = DATA. + // Write dword. + void + Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE2_B32 class methods --- + + Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2_B32 + + Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32() + { + } // ~Inst_DS__DS_WRITE2_B32 + + // --- description from 
.arch file --- + // 32b: + // MEM[ADDR_BASE + OFFSET0 * 4] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2. + // Write 2 dwords. + void + Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 2] + = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 4; + Addr offset1 = instData.OFFSET1 * 4; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_WRITE2ST64_B32 class methods --- + + Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2st64_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2ST64_B32 + + Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32() + { + } // ~Inst_DS__DS_WRITE2ST64_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2; + // Write 2 dwords. 
+ void + Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 2] + = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 4 * 64; + Addr offset1 = instData.OFFSET1 * 4 * 64; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_CMPST_B32 class methods --- + + Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_b32") + { + } // Inst_DS__DS_CMPST_B32 + + Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32() + { + } // ~Inst_DS__DS_CMPST_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP opcode. 
+ void + Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_F32 class methods --- + + Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_f32") + { + setFlag(F32); + } // Inst_DS__DS_CMPST_F32 + + Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32() + { + } // ~Inst_DS__DS_CMPST_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP opcode. + void + Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_F32 class methods --- + + Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_F32 + + Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32() + { + } // ~Inst_DS__DS_MIN_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN. + void + Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_F32 class methods --- + + Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_F32 + + Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32() + { + } // ~Inst_DS__DS_MAX_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. 
+ // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX. + void + Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_NOP class methods --- + + Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_nop") + { + setFlag(Nop); + } // Inst_DS__DS_NOP + + Inst_DS__DS_NOP::~Inst_DS__DS_NOP() + { + } // ~Inst_DS__DS_NOP + + // --- description from .arch file --- + // Do nothing. + void + Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst) + { + gpuDynInst->wavefront()->decLGKMInstsIssued(); + } // execute + // --- Inst_DS__DS_ADD_F32 class methods --- + + Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_f32") + { + setFlag(F32); + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_F32 + + Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32() + { + } // ~Inst_DS__DS_ADD_F32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] += DATA; + // Floating point add that handles NaN/INF/denormal values. 
+ void + Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandF32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B8 class methods --- + + Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b8") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B8 + + Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8() + { + } // ~Inst_DS__DS_WRITE_B8 + + // --- description from .arch file --- + // MEM[ADDR] = DATA[7:0]. + // Byte write. 
+ void + Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU8 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B8_D16_HI class methods --- + + Inst_DS__DS_WRITE_B8_D16_HI::Inst_DS__DS_WRITE_B8_D16_HI(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b8_d16_hi") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B8_D16_HI + + Inst_DS__DS_WRITE_B8_D16_HI::~Inst_DS__DS_WRITE_B8_D16_HI() + { + } // ~Inst_DS__DS_WRITE_B8_D16_HI + + // --- description from .arch file --- + // MEM[ADDR] = DATA[23:16]. + // Byte write in to high word. 
+ void + Inst_DS__DS_WRITE_B8_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU8 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = bits(data[lane], 23, 16); + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B8_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B8_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B16 class methods --- + + Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b16") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B16 + + Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16() + { + } // ~Inst_DS__DS_WRITE_B16 + + // --- description from .arch file --- + // MEM[ADDR] = DATA[15:0] + // Short write. 
+ void + Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU16 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_ADD_RTN_U32 class methods --- + + Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_u32") + { + } // Inst_DS__DS_ADD_RTN_U32 + + Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32() + { + } // ~Inst_DS__DS_ADD_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_RTN_U32 class methods --- + + Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_rtn_u32") + { + } // Inst_DS__DS_SUB_RTN_U32 + + Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32() + { + } // ~Inst_DS__DS_SUB_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_RTN_U32 class methods --- + + Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_rtn_u32") + { + } // Inst_DS__DS_RSUB_RTN_U32 + + Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32() + { + } // ~Inst_DS__DS_RSUB_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. + void + Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_RTN_U32 class methods --- + + Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_rtn_u32") + { + } // Inst_DS__DS_INC_RTN_U32 + + Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32() + { + } // ~Inst_DS__DS_INC_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_RTN_U32 class methods --- + + Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_rtn_u32") + { + } // Inst_DS__DS_DEC_RTN_U32 + + Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32() + { + } // ~Inst_DS__DS_DEC_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_I32 class methods --- + + Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_i32") + { + } // Inst_DS__DS_MIN_RTN_I32 + + Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32() + { + } // ~Inst_DS__DS_MIN_RTN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_I32 class methods --- + + Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_i32") + { + } // Inst_DS__DS_MAX_RTN_I32 + + Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32() + { + } // ~Inst_DS__DS_MAX_RTN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_U32 class methods --- + + Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_u32") + { + } // Inst_DS__DS_MIN_RTN_U32 + + Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32() + { + } // ~Inst_DS__DS_MIN_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_U32 class methods --- + + Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_u32") + { + } // Inst_DS__DS_MAX_RTN_U32 + + Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32() + { + } // ~Inst_DS__DS_MAX_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_RTN_B32 class methods --- + + Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_rtn_b32") + { + } // Inst_DS__DS_AND_RTN_B32 + + Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32() + { + } // ~Inst_DS__DS_AND_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_RTN_B32 class methods --- + + Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_rtn_b32") + { + } // Inst_DS__DS_OR_RTN_B32 + + Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32() + { + } // ~Inst_DS__DS_OR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_RTN_B32 class methods --- + + Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_rtn_b32") + { + } // Inst_DS__DS_XOR_RTN_B32 + + Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32() + { + } // ~Inst_DS__DS_XOR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_RTN_B32 class methods --- + + Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_rtn_b32") + { + } // Inst_DS__DS_MSKOR_RTN_B32 + + Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32() + { + } // ~Inst_DS__DS_MSKOR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. 
+ void + Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg_rtn_b32") + { + } // Inst_DS__DS_WRXCHG_RTN_B32 + + Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG_RTN_B32 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + // Write-exchange operation. + void + Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32") + { + } // Inst_DS__DS_WRXCHG2_RTN_B32 + + Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG2_RTN_B32 + + // --- description from .arch file --- + // Write-exchange 2 separate dwords. + void + Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2ST64_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32") + { + } // Inst_DS__DS_WRXCHG2ST64_RTN_B32 + + Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32 + + // --- description from .arch file --- + // Write-exchange 2 separate dwords with a stride of 64 dwords. 
+ void + Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_B32 class methods --- + + Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_b32") + { + } // Inst_DS__DS_CMPST_RTN_B32 + + Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32() + { + } // ~Inst_DS__DS_CMPST_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP opcode. + void + Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_F32 class methods --- + + Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_CMPST_RTN_F32 + + Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32() + { + } // ~Inst_DS__DS_CMPST_RTN_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP opcode. + void + Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_F32 class methods --- + + Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_RTN_F32 + + Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32() + { + } // ~Inst_DS__DS_MIN_RTN_F32 + + // --- description from .arch file --- + // 32b. 
+ // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN. + void + Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_F32 class methods --- + + Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_RTN_F32 + + Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32() + { + } // ~Inst_DS__DS_MAX_RTN_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. + // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX. + void + Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRAP_RTN_B32 class methods --- + + Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrap_rtn_b32") + { + } // Inst_DS__DS_WRAP_RTN_B32 + + Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32() + { + } // ~Inst_DS__DS_WRAP_RTN_B32 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_RTN_F32 class methods --- + + Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_ADD_RTN_F32 + + Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32() + { + } // ~Inst_DS__DS_ADD_RTN_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + // Floating point add that handles NaN/INF/denormal values. + void + Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_B32 class methods --- + + Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b32") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B32 + + Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32() + { + } // ~Inst_DS__DS_READ_B32 + + // --- description from .arch file --- + // RETURN_DATA = MEM[ADDR]. + // Dword read. 
+ void + Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ2_B32 class methods --- + + Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2_b32") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2_B32 + + Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32() + { + } // ~Inst_DS__DS_READ2_B32 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4]. + // Read 2 dwords. 
+ void + Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 4; + Addr offset1 = instData.OFFSET1 * 4; + + initDualMemRead(gpuDynInst, offset0, offset1); + } // initiateAcc + + void + Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_DS__DS_READ2ST64_B32 class methods --- + + Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2st64_b32") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2ST64_B32 + + Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32() + { + } // ~Inst_DS__DS_READ2ST64_B32 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64]. + // Read 2 dwords. 
+ void + Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = (instData.OFFSET0 * 4 * 64); + Addr offset1 = (instData.OFFSET1 * 4 * 64); + + initDualMemRead(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } + // --- Inst_DS__DS_READ_I8 class methods --- + + Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_i8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_I8 + + Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() + { + } // ~Inst_DS__DS_READ_I8 + + // --- description from .arch file --- + // RETURN_DATA = signext(MEM[ADDR][7:0]). + // Signed byte read. 
+ void + Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_I8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_I8::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)sext<8>((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_U8 class methods --- + + Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U8 + + Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() + { + } // ~Inst_DS__DS_READ_U8 + + // --- description from .arch file --- + // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. + // Unsigned byte read. 
+ void + Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)(reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_I16 class methods --- + + Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_i16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_I16 + + Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() + { + } // ~Inst_DS__DS_READ_I16 + + // --- description from .arch file --- + // RETURN_DATA = signext(MEM[ADDR][15:0]). + // Signed short read. 
+ void + Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_U16 class methods --- + + Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16 + + Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() + { + } // ~Inst_DS__DS_READ_U16 + + // --- description from .arch file --- + // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. + // Unsigned short read. + void + Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)(reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_SWIZZLE_B32 class methods --- + + Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_swizzle_b32") + { + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the 
LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_SWIZZLE_B32 + + Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() + { + } // ~Inst_DS__DS_SWIZZLE_B32 + + // --- description from .arch file --- + // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). + // Dword swizzle, no data is written to LDS memory; See ds_opcodes.docx for + // --- details. + void + Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + /** + * The "DS pattern" is comprised of both offset fields. That is, the + * swizzle pattern between lanes. Bit 15 of the DS pattern dictates + * which swizzle mode to use. There are two different swizzle + * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use + * QDMode else use Bit-masks mode. The remaining bits dictate how to + * swizzle the lanes. + * + * QDMode: Chunks the lanes into 4s and swizzles among them. + * Bits 7:6 dictate where lane 3 (of the current chunk) + * gets its date, 5:4 lane 2, etc. + * + * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks. + * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 + * is the and_mask. Each lane is swizzled by performing + * the appropriate operation using these masks. + */ + VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); + + data.read(); + + if (bits(ds_pattern, 15)) { + // QDMode + for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { + /** + * This operation allows data sharing between groups + * of four consecutive threads. Note the increment by + * 4 in the for loop. 
+ */ + if (gpuDynInst->exec_mask[lane]) { + int index0 = lane + bits(ds_pattern, 1, 0); + panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index0); + vdst[lane] + = gpuDynInst->exec_mask[index0] ? data[index0]: 0; + } + if (gpuDynInst->exec_mask[lane + 1]) { + int index1 = lane + bits(ds_pattern, 3, 2); + panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index1); + vdst[lane + 1] + = gpuDynInst->exec_mask[index1] ? data[index1]: 0; + } + if (gpuDynInst->exec_mask[lane + 2]) { + int index2 = lane + bits(ds_pattern, 5, 4); + panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index2); + vdst[lane + 2] + = gpuDynInst->exec_mask[index2] ? data[index2]: 0; + } + if (gpuDynInst->exec_mask[lane + 3]) { + int index3 = lane + bits(ds_pattern, 7, 6); + panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index3); + vdst[lane + 3] + = gpuDynInst->exec_mask[index3] ? data[index3]: 0; + } + } + } else { + // Bit Mode + int and_mask = bits(ds_pattern, 4, 0); + int or_mask = bits(ds_pattern, 9, 5); + int xor_mask = bits(ds_pattern, 14, 10); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + int index = (((lane & and_mask) | or_mask) ^ xor_mask); + // Adjust for the next 32 lanes. + if (lane > 31) { + index += 32; + } + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is " + "out of bounds.\n", gpuDynInst->disassemble(), + index); + vdst[lane] + = gpuDynInst->exec_mask[index] ? data[index] : 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. 
+ * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + // --- Inst_DS__DS_PERMUTE_B32 class methods --- + + Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_permute_b32") + { + setFlag(MemoryRef); + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_PERMUTE_B32 + + Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() + { + } // ~Inst_DS__DS_PERMUTE_B32 + + // --- description from .arch file --- + // Forward permute. + void + Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + addr.read(); + data.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + /** + * One of the offset fields can be used for the index. + * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ + assert(!instData.OFFSET1); + /** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. 
Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. + */ + int index = bits(addr[lane] + instData.OFFSET0, 7, 2); + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); + /** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. + */ + if (wf->execMask(index)) { + vdst[index] = data[lane]; + } else { + vdst[index] = 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + // --- Inst_DS__DS_BPERMUTE_B32 class methods --- + + Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_bpermute_b32") + { + setFlag(MemoryRef); + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_BPERMUTE_B32 + + Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() + { + } // ~Inst_DS__DS_BPERMUTE_B32 + + // --- description from .arch file --- + // Backward permute. 
+ void + Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + addr.read(); + data.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + /** + * One of the offset fields can be used for the index. + * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ + assert(!instData.OFFSET1); + /** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. + */ + int index = bits(addr[lane] + instData.OFFSET0, 7, 2); + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); + /** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. + */ + if (wf->execMask(index)) { + vdst[lane] = data[index]; + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. 
+ * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + + // --- Inst_DS__DS_ADD_U64 class methods --- + + Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_u64") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_U64 + + Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() + { + } // ~Inst_DS__DS_ADD_U64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR] += DATA[0:1]; + void + Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_ADD_U64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_ADD_U64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_SUB_U64 class methods --- + + 
    // ds_sub_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_u64")
    {
    } // Inst_DS__DS_SUB_U64

    Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64()
    {
    } // ~Inst_DS__DS_SUB_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_RSUB_U64 class methods ---

    // ds_rsub_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_u64")
    {
    } // Inst_DS__DS_RSUB_U64

    Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64()
    {
    } // ~Inst_DS__DS_RSUB_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    // Subtraction with reversed operands.
    void
    Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_U64 class methods ---

    // ds_inc_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_u64")
    {
    } // Inst_DS__DS_INC_U64

    Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64()
    {
    } // ~Inst_DS__DS_INC_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_U64 class methods ---

    // ds_dec_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_u64")
    {
    } // Inst_DS__DS_DEC_U64

    Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64()
    {
    } // ~Inst_DS__DS_DEC_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1
    // (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_I64 class methods ---

    // ds_min_i64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_i64")
    {
    } // Inst_DS__DS_MIN_I64

    Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64()
    {
    } // ~Inst_DS__DS_MIN_I64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_I64 class methods ---

    // ds_max_i64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_i64")
    {
    } // Inst_DS__DS_MAX_I64

    Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64()
    {
    } // ~Inst_DS__DS_MAX_I64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_U64 class methods ---

    // ds_min_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_u64")
    {
    } // Inst_DS__DS_MIN_U64

    Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64()
    {
    } // ~Inst_DS__DS_MIN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_U64 class methods ---

    // ds_max_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_u64")
    {
    } // Inst_DS__DS_MAX_U64

    Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64()
    {
    } // ~Inst_DS__DS_MAX_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_AND_B64 class methods ---

    // ds_and_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_b64")
    {
    } // Inst_DS__DS_AND_B64

    Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64()
    {
    } // ~Inst_DS__DS_AND_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_OR_B64 class methods ---

    // ds_or_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_b64")
    {
    } // Inst_DS__DS_OR_B64

    Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64()
    {
    } // ~Inst_DS__DS_OR_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_XOR_B64 class methods ---

    // ds_xor_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_b64")
    {
    } // Inst_DS__DS_XOR_B64

    Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64()
    {
    } // ~Inst_DS__DS_XOR_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MSKOR_B64 class methods ---

    // ds_mskor_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_b64")
    {
    } // Inst_DS__DS_MSKOR_B64

    Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64()
    {
    } // ~Inst_DS__DS_MSKOR_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    // Masked dword OR, D0 contains the mask and D1 contains the new value.
    void
    Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRITE_B64 class methods ---

    // ds_write_b64: implemented LDS store of one qword per lane.
    Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b64")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_B64

    Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64()
    {
    } // ~Inst_DS__DS_WRITE_B64

    // --- description from .arch file ---
    // 64b:
    // MEM[ADDR] = DATA.
    // Write qword.
+ void + Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE2_B64 class methods --- + + Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2_B64 + + Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() + { + } // ~Inst_DS__DS_WRITE2_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. + // Write 2 qwords. 
+ void + Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8; + Addr offset1 = instData.OFFSET1 * 8; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_WRITE2ST64_B64 class methods --- + + Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2st64_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2ST64_B64 + + Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() + { + } // ~Inst_DS__DS_WRITE2ST64_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; + // Write 2 qwords. 
+ void + Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8 * 64; + Addr offset1 = instData.OFFSET1 * 8 * 64; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_CMPST_B64 class methods --- + + Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_b64") + { + } // Inst_DS__DS_CMPST_B64 + + Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() + { + } // ~Inst_DS__DS_CMPST_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. 
    // ds_cmpst_b64: decode-only stub; execute() panics (unimplemented).
    void
    Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_F64 class methods ---

    // ds_cmpst_f64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_CMPST_F64

    Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64()
    {
    } // ~Inst_DS__DS_CMPST_F64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Floating point compare and store that handles NaN/INF/denormal values.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode.
    void
    Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_F64 class methods ---

    // ds_min_f64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MIN_F64

    Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64()
    {
    } // ~Inst_DS__DS_MIN_F64

    // --- description from .arch file ---
    // 64b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    // Floating point minimum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMIN_X2.
    void
    Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_F64 class methods ---

    // ds_max_f64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MAX_F64

    Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64()
    {
    } // ~Inst_DS__DS_MAX_F64

    // --- description from .arch file ---
    // 64b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    // Floating point maximum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMAX_X2.
    void
    Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ADD_RTN_U64 class methods ---

    // ds_add_rtn_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_rtn_u64")
    {
    } // Inst_DS__DS_ADD_RTN_U64

    Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64()
    {
    } // ~Inst_DS__DS_ADD_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_SUB_RTN_U64 class methods ---

    // ds_sub_rtn_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_rtn_u64")
    {
    } // Inst_DS__DS_SUB_RTN_U64

    Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64()
    {
    } // ~Inst_DS__DS_SUB_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_RSUB_RTN_U64 class methods ---

    // ds_rsub_rtn_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_rtn_u64")
    {
    } // Inst_DS__DS_RSUB_RTN_U64

    Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64()
    {
    } // ~Inst_DS__DS_RSUB_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    // Subtraction with reversed operands.
    void
    Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_RTN_U64 class methods ---

    // ds_inc_rtn_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_rtn_u64")
    {
    } // Inst_DS__DS_INC_RTN_U64

    Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64()
    {
    } // ~Inst_DS__DS_INC_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_RTN_U64 class methods ---

    // ds_dec_rtn_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_rtn_u64")
    {
    } // Inst_DS__DS_DEC_RTN_U64

    Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64()
    {
    } // ~Inst_DS__DS_DEC_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1
    // (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_I64 class methods ---

    // ds_min_rtn_i64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_i64")
    {
    } // Inst_DS__DS_MIN_RTN_I64

    Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64()
    {
    } // ~Inst_DS__DS_MIN_RTN_I64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_I64 class methods ---

    // ds_max_rtn_i64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_i64")
    {
    } // Inst_DS__DS_MAX_RTN_I64

    Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64()
    {
    } // ~Inst_DS__DS_MAX_RTN_I64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_U64 class methods ---

    // ds_min_rtn_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_u64")
    {
    } // Inst_DS__DS_MIN_RTN_U64

    Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64()
    {
    } // ~Inst_DS__DS_MIN_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_U64 class methods ---

    // ds_max_rtn_u64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_u64")
    {
    } // Inst_DS__DS_MAX_RTN_U64

    Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64()
    {
    } // ~Inst_DS__DS_MAX_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_AND_RTN_B64 class methods ---

    // ds_and_rtn_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_rtn_b64")
    {
    } // Inst_DS__DS_AND_RTN_B64

    Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64()
    {
    } // ~Inst_DS__DS_AND_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_OR_RTN_B64 class methods ---

    // ds_or_rtn_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_rtn_b64")
    {
    } // Inst_DS__DS_OR_RTN_B64

    Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64()
    {
    } // ~Inst_DS__DS_OR_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_XOR_RTN_B64 class methods ---

    // ds_xor_rtn_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_rtn_b64")
    {
    } // Inst_DS__DS_XOR_RTN_B64

    Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64()
    {
    } // ~Inst_DS__DS_XOR_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MSKOR_RTN_B64 class methods ---

    // ds_mskor_rtn_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_rtn_b64")
    {
    } // Inst_DS__DS_MSKOR_RTN_B64

    Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64()
    {
    } // ~Inst_DS__DS_MSKOR_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    // Masked dword OR, D0 contains the mask and D1 contains the new value.
    void
    Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG_RTN_B64 class methods ---

    // ds_wrxchg_rtn_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG_RTN_B64

    Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG_RTN_B64

    // --- description from .arch file ---
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    // Write-exchange operation.
    void
    Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG2_RTN_B64 class methods ---

    // ds_wrxchg2_rtn_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG2_RTN_B64

    Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG2_RTN_B64

    // --- description from .arch file ---
    // Write-exchange 2 separate qwords.
    void
    Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG2ST64_RTN_B64 class methods ---

    // ds_wrxchg2st64_rtn_b64: decode-only stub; execute() panics
    // (unimplemented).
    Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG2ST64_RTN_B64

    Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64

    // --- description from .arch file ---
    // Write-exchange 2 qwords with a stride of 64 qwords.
    void
    Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_RTN_B64 class methods ---

    // ds_cmpst_rtn_b64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_b64")
    {
    } // Inst_DS__DS_CMPST_RTN_B64

    Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64()
    {
    } // ~Inst_DS__DS_CMPST_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode.
    void
    Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_RTN_F64 class methods ---

    // ds_cmpst_rtn_f64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_CMPST_RTN_F64

    Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64()
    {
    } // ~Inst_DS__DS_CMPST_RTN_F64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Floating point compare and store that handles NaN/INF/denormal values.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode.
    void
    Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_F64 class methods ---

    // ds_min_rtn_f64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MIN_RTN_F64

    Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64()
    {
    } // ~Inst_DS__DS_MIN_RTN_F64

    // --- description from .arch file ---
    // 64b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    // Floating point minimum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMIN_X2.
    void
    Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_F64 class methods ---

    // ds_max_rtn_f64: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MAX_RTN_F64

    Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64()
    {
    } // ~Inst_DS__DS_MAX_RTN_F64

    // --- description from .arch file ---
    // 64b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    // Floating point maximum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMAX_X2.
+ void + Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_B64 class methods --- + + Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B64 + + Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() + { + } // ~Inst_DS__DS_READ_B64 + + // --- description from .arch file --- + // RETURN_DATA = MEM[ADDR]. + // Read 1 qword. + void + Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ2_B64 class methods --- + + Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2_B64 + + Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() + { + } // ~Inst_DS__DS_READ2_B64 + + // --- description from .arch file --- + // 
RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. + // Read 2 qwords. + void + Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8; + Addr offset1 = instData.OFFSET1 * 8; + + initDualMemRead(gpuDynInst, offset0, offset1); + } // initiateAcc + + void + Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst0(gpuDynInst, extData.VDST); + VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_DS__DS_READ2ST64_B64 class methods --- + + Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2st64_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2ST64_B64 + + Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() + { + } // ~Inst_DS__DS_READ2ST64_B64 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. + // Read 2 qwords. 
+ void + Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = (instData.OFFSET0 * 8 * 64); + Addr offset1 = (instData.OFFSET1 * 8 * 64); + + initDualMemRead(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst0(gpuDynInst, extData.VDST); + VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } + // --- Inst_DS__DS_CONDXCHG32_RTN_B64 class methods --- + + Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") + { + } // Inst_DS__DS_CONDXCHG32_RTN_B64 + + Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() + { + } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 + + // --- description from .arch file --- + // Conditional write exchange. 
    // ds_condxchg32_rtn_b64: decode-only stub; execute() panics
    // (unimplemented).
    void
    Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ADD_SRC2_U32 class methods ---

    // ds_add_src2_u32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_u32")
    {
    } // Inst_DS__DS_ADD_SRC2_U32

    Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32()
    {
    } // ~Inst_DS__DS_ADD_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] + MEM[B].
    void
    Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_SUB_SRC2_U32 class methods ---

    // ds_sub_src2_u32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_src2_u32")
    {
    } // Inst_DS__DS_SUB_SRC2_U32

    Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32()
    {
    } // ~Inst_DS__DS_SUB_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] - MEM[B].
    void
    Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_RSUB_SRC2_U32 class methods ---

    // ds_rsub_src2_u32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_src2_u32")
    {
    } // Inst_DS__DS_RSUB_SRC2_U32

    Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32()
    {
    } // ~Inst_DS__DS_RSUB_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] - MEM[A].
    void
    Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_SRC2_U32 class methods ---

    // ds_inc_src2_u32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_src2_u32")
    {
    } // Inst_DS__DS_INC_SRC2_U32

    Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32()
    {
    } // ~Inst_DS__DS_INC_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1).
    void
    Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_SRC2_U32 class methods ---

    // ds_dec_src2_u32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_src2_u32")
    {
    } // Inst_DS__DS_DEC_SRC2_U32

    Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32()
    {
    } // ~Inst_DS__DS_DEC_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1).
    // Uint decrement.
    void
    Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_SRC2_I32 class methods ---

    // ds_min_src2_i32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_i32")
    {
    } // Inst_DS__DS_MIN_SRC2_I32

    Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_I32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_SRC2_I32 class methods ---

    // ds_max_src2_i32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_i32")
    {
    } // Inst_DS__DS_MAX_SRC2_I32

    Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_I32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_SRC2_U32 class methods ---

    // ds_min_src2_u32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_u32")
    {
    } // Inst_DS__DS_MIN_SRC2_U32

    Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_SRC2_U32 class methods ---

    // ds_max_src2_u32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_u32")
    {
    } // Inst_DS__DS_MAX_SRC2_U32

    Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_AND_SRC2_B32 class methods ---

    // ds_and_src2_b32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_src2_b32")
    {
    } // Inst_DS__DS_AND_SRC2_B32

    Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32()
    {
    } // ~Inst_DS__DS_AND_SRC2_B32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] & MEM[B].
    void
    Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_OR_SRC2_B32 class methods ---

    // ds_or_src2_b32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_src2_b32")
    {
    } // Inst_DS__DS_OR_SRC2_B32

    Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32()
    {
    } // ~Inst_DS__DS_OR_SRC2_B32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] | MEM[B].
    void
    Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_XOR_SRC2_B32 class methods ---

    // ds_xor_src2_b32: decode-only stub; execute() panics (unimplemented).
    Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_src2_b32")
    {
    } // Inst_DS__DS_XOR_SRC2_B32

    Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32()
    {
    } // ~Inst_DS__DS_XOR_SRC2_B32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] ^ MEM[B].
+ void + Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_SRC2_B32 class methods --- + + Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_src2_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_SRC2_B32 + + Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32() + { + } // ~Inst_DS__DS_WRITE_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B]. + // Write dword. + void + Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_F32 class methods --- + + Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_SRC2_F32 + + Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32() + { + } // ~Inst_DS__DS_MIN_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_F32 class methods --- + + Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_SRC2_F32 + + Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32() + { + } // ~Inst_DS__DS_MAX_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. 
+ // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_F32 class methods --- + + Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_ADD_SRC2_F32 + + Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32() + { + } // ~Inst_DS__DS_ADD_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] + MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_RELEASE_ALL class methods --- + + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_release_all") + { + } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL + + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL() + { + } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL + + // --- description from .arch file --- + // GDS Only: The GWS resource (rid) indicated will process this opcode by + // updating the counter and labeling the specified resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Incr the state counter of the resource + // state.counter[rid] = state.wave_in_queue; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release ALL queued waves; it Will have no effect if no + // --- waves are present. 
+ void + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_INIT class methods --- + + Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_init") + { + } // Inst_DS__DS_GWS_INIT + + Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT() + { + } // ~Inst_DS__DS_GWS_INIT + + // --- description from .arch file --- + // GDS Only: Initialize a barrier or semaphore resource. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Get the value to use in init + // index = find_first_valid(vector mask) + // value = DATA[thread: index] + // //Set the state of the resource + // state.counter[rid] = lsb(value); //limit #waves + // state.flag[rid] = 0; + // return rd_done; //release calling wave + void + Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_V class methods --- + + Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_v") + { + } // Inst_DS__DS_GWS_SEMA_V + + Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V() + { + } // ~Inst_DS__DS_GWS_SEMA_V + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // updating the counter and labeling the resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Incr the state counter of the resource + // state.counter[rid]++; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release one waved if any are queued in this resource. 
+ void + Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_BR class methods --- + + Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_br") + { + } // Inst_DS__DS_GWS_SEMA_BR + + Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR() + { + } // ~Inst_DS__DS_GWS_SEMA_BR + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // updating the counter by the bulk release delivered count and labeling + // the resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // index = find first valid (vector mask) + // count = DATA[thread: index]; + // //Add count to the resource state counter + // state.counter[rid] += count; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release count number of waves, immediately if queued, + // or as they arrive from the noted resource. + void + Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_P class methods --- + + Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_p") + { + } // Inst_DS__DS_GWS_SEMA_P + + Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P() + { + } // ~Inst_DS__DS_GWS_SEMA_P + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // queueing it until counter enables a release and then decrementing the + // counter of the resource as a semaphore. 
+ // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // state.type = SEMAPHORE; + // ENQUEUE until(state[rid].counter > 0) + // state[rid].counter--; + // return rd_done + void + Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_BARRIER class methods --- + + Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_barrier") + { + } // Inst_DS__DS_GWS_BARRIER + + Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER() + { + } // ~Inst_DS__DS_GWS_BARRIER + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // queueing it until barrier is satisfied. The number of waves needed is + // passed in as DATA of first valid thread. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + OFFSET0[5:0]; + // index = find first valid (vector mask); + // value = DATA[thread: index]; + // // Input Decision Machine + // state.type[rid] = BARRIER; + // if (state[rid].counter <= 0) { + // thread[rid].flag = state[rid].flag; + // ENQUEUE; + // state[rid].flag = !state.flag; + // state[rid].counter = value; + // return rd_done; + // } else { + // state[rid].counter--; + // thread.flag = state[rid].flag; + // ENQUEUE; + // } + // Since the waves deliver the count for the next barrier, this function + // can have a different size barrier for each occurrence. 
+ // // Release Machine + // if (state.type == BARRIER) { + // if (state.flag != thread.flag) { + // return rd_done; + // } + // } + void + Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CONSUME class methods --- + + Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_consume") + { + } // Inst_DS__DS_CONSUME + + Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME() + { + } // ~Inst_DS__DS_CONSUME + + // --- description from .arch file --- + // LDS & GDS. Subtract (count_bits(exec_mask)) from the value stored in DS + // memory at (M0.base + instr_offset). Return the pre-operation value to + // VGPRs. + void + Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_APPEND class methods --- + + Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_append") + { + } // Inst_DS__DS_APPEND + + Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND() + { + } // ~Inst_DS__DS_APPEND + + // --- description from .arch file --- + // LDS & GDS. Add (count_bits(exec_mask)) to the value stored in DS memory + // at (M0.base + instr_offset). Return the pre-operation value to VGPRs. + void + Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ORDERED_COUNT class methods --- + + Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_ordered_count") + { + } // Inst_DS__DS_ORDERED_COUNT + + Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT() + { + } // ~Inst_DS__DS_ORDERED_COUNT + + // --- description from .arch file --- + // GDS-only. Add (count_bits(exec_mask)) to one of 4 dedicated + // ordered-count counters (aka 'packers'). Additional bits of instr.offset + // field are overloaded to hold packer-id, 'last'. 
+ void + Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_U64 class methods --- + + Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_u64") + { + } // Inst_DS__DS_ADD_SRC2_U64 + + Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64() + { + } // ~Inst_DS__DS_ADD_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] + MEM[B]. + void + Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_SRC2_U64 class methods --- + + Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_src2_u64") + { + } // Inst_DS__DS_SUB_SRC2_U64 + + Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64() + { + } // ~Inst_DS__DS_SUB_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] - MEM[B]. + void + Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_SRC2_U64 class methods --- + + Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_src2_u64") + { + } // Inst_DS__DS_RSUB_SRC2_U64 + + Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64() + { + } // ~Inst_DS__DS_RSUB_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] - MEM[A]. 
+ void + Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_SRC2_U64 class methods --- + + Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_src2_u64") + { + } // Inst_DS__DS_INC_SRC2_U64 + + Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64() + { + } // ~Inst_DS__DS_INC_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). + void + Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_SRC2_U64 class methods --- + + Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_src2_u64") + { + } // Inst_DS__DS_DEC_SRC2_U64 + + Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64() + { + } // ~Inst_DS__DS_DEC_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). + // Uint decrement. + void + Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_I64 class methods --- + + Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_i64") + { + } // Inst_DS__DS_MIN_SRC2_I64 + + Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64() + { + } // ~Inst_DS__DS_MIN_SRC2_I64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). 
+ void + Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_I64 class methods --- + + Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_i64") + { + } // Inst_DS__DS_MAX_SRC2_I64 + + Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64() + { + } // ~Inst_DS__DS_MAX_SRC2_I64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). + void + Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_U64 class methods --- + + Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_u64") + { + } // Inst_DS__DS_MIN_SRC2_U64 + + Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64() + { + } // ~Inst_DS__DS_MIN_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). + void + Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_U64 class methods --- + + Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_u64") + { + } // Inst_DS__DS_MAX_SRC2_U64 + + Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64() + { + } // ~Inst_DS__DS_MAX_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). 
+ void + Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_SRC2_B64 class methods --- + + Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_src2_b64") + { + } // Inst_DS__DS_AND_SRC2_B64 + + Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() + { + } // ~Inst_DS__DS_AND_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] & MEM[B]. + void + Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_SRC2_B64 class methods --- + + Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_src2_b64") + { + } // Inst_DS__DS_OR_SRC2_B64 + + Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() + { + } // ~Inst_DS__DS_OR_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] | MEM[B]. + void + Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_SRC2_B64 class methods --- + + Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_src2_b64") + { + } // Inst_DS__DS_XOR_SRC2_B64 + + Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() + { + } // ~Inst_DS__DS_XOR_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] ^ MEM[B]. 
+ void + Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_SRC2_B64 class methods --- + + Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_src2_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_SRC2_B64 + + Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() + { + } // ~Inst_DS__DS_WRITE_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B]. + // Write qword. + void + Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_F64 class methods --- + + Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_f64") + { + setFlag(F64); + } // Inst_DS__DS_MIN_SRC2_F64 + + Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() + { + } // ~Inst_DS__DS_MIN_SRC2_F64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_F64 class methods --- + + Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_f64") + { + setFlag(F64); + } // Inst_DS__DS_MAX_SRC2_F64 + + Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() + { + } // ~Inst_DS__DS_MAX_SRC2_F64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. 
+    // Float, handles NaN/INF/denorm.
+    void
+    Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_DS__DS_WRITE_B96 class methods ---
+
+    Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt)
+        : Inst_DS(iFmt, "ds_write_b96")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+    } // Inst_DS__DS_WRITE_B96
+
+    Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96()
+    {
+    } // ~Inst_DS__DS_WRITE_B96
+
+    // --- description from .arch file ---
+    // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0].
+    // Tri-dword write.
+    void
+    Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
+        ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
+        ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
+
+        addr.read();
+        data0.read();
+        data1.read();
+        data2.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                // Stage the three dwords per lane into the d_data staging
+                // buffer; each lane owns a 4-dword slot, of which the first
+                // three are used for a b96 write. The <VecElemU32*> template
+                // argument was dropped by text mangling and is restored here
+                // (reinterpret_cast requires an explicit target type).
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4] = data0[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    } // execute
+
+    void
+    Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        // The 16-bit DS offset is split across two 8-bit instruction fields.
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initMemWrite<3>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_DS__DS_WRITE_B128 class methods ---
--- + + Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b128") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B128 + + Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() + { + } // ~Inst_DS__DS_WRITE_B128 + + // --- description from .arch file --- + // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. + // Qword write. + void + Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_READ_B96 class methods --- + + 
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b96") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B96 + + Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() + { + } // ~Inst_DS__DS_READ_B96 + + // --- description from .arch file --- + // Tri-dword read. + void + Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<3>(gpuDynInst, offset); + } + + void + Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } + // --- Inst_DS__DS_READ_B128 class methods --- + + Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b128") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B128 + + Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() + { + } // ~Inst_DS__DS_READ_B128 + + // --- description from .arch file --- + // Qword read. 
+ void + Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/exp.cc b/src/arch/amdgpu/vega/insts/exp.cc new file mode 100644 index 0000000000..31b6ded10f --- /dev/null +++ b/src/arch/amdgpu/vega/insts/exp.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_EXP__EXP class methods --- + + Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt) + : Inst_EXP(iFmt, "exp") + { + } // Inst_EXP__EXP + + Inst_EXP__EXP::~Inst_EXP__EXP() + { + } // ~Inst_EXP__EXP + + // --- description from .arch file --- + // Export through SX. 
+ void + Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/flat.cc b/src/arch/amdgpu/vega/insts/flat.cc new file mode 100644 index 0000000000..7f79025b3f --- /dev/null +++ b/src/arch/amdgpu/vega/insts/flat.cc @@ -0,0 +1,2138 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_FLAT__FLAT_LOAD_UBYTE class methods --- + + Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_ubyte") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_UBYTE + + Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() + { + } // ~Inst_FLAT__FLAT_LOAD_UBYTE + + // --- description from .arch file --- + // Untyped buffer load unsigned byte (zero extend to VGPR destination). + void + Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } + } + vdst.write(); + } // execute + // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods --- + + Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_sbyte") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_SBYTE + + Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() + { + } // ~Inst_FLAT__FLAT_LOAD_SBYTE + + // --- description from .arch file --- + // Untyped buffer load 
signed byte (sign extend to VGPR destination). + void + Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_LOAD_USHORT class methods --- + + Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_ushort") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_USHORT + + Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() + { + } // ~Inst_FLAT__FLAT_LOAD_USHORT + + // --- description from .arch file --- + // Untyped buffer load unsigned short (zero extend to VGPR destination). + void + Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } + } + vdst.write(); + } // execute + + // --- Inst_FLAT__FLAT_LOAD_SSHORT class methods --- + + Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt) + : 
Inst_FLAT(iFmt, "flat_load_sshort") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_SSHORT + + Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() + { + } // ~Inst_FLAT__FLAT_LOAD_SSHORT + + // --- description from .arch file --- + // Untyped buffer load signed short (sign extend to VGPR destination). + void + Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_LOAD_DWORD class methods --- + + Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORD + + Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORD + + // --- description from .arch file --- + // Untyped buffer load dword. 
+    void
+    Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decVMemInstsIssued();
+            if (isFlat()) {
+                wf->decLGKMInstsIssued();
+            }
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
+
+        issueRequestHelper(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initMemRead<VecElemU32>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane];
+            }
+        }
+        vdst.write();
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_LOAD_DWORDX2 class methods ---
+
+    Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2(
+        InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_load_dwordx2")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+    } // Inst_FLAT__FLAT_LOAD_DWORDX2
+
+    Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2()
+    {
+    } // ~Inst_FLAT__FLAT_LOAD_DWORDX2
+
+    // --- description from .arch file ---
+    // Untyped buffer load 2 dwords.
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decVMemInstsIssued();
+            if (isFlat()) {
+                wf->decLGKMInstsIssued();
+            }
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
+
+        issueRequestHelper(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initMemRead<VecElemU64>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU64 vdst(gpuDynInst, extData.VDST);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst[lane] = (reinterpret_cast<VecElemU64*>(
+                    gpuDynInst->d_data))[lane];
+            }
+        }
+        vdst.write();
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_LOAD_DWORDX3 class methods ---
+
+    Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3(
+        InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_load_dwordx3")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+    } // Inst_FLAT__FLAT_LOAD_DWORDX3
+
+    Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3()
+    {
+    } // ~Inst_FLAT__FLAT_LOAD_DWORDX3
+
+    // --- description from .arch file ---
+    // Untyped buffer load 3 dwords.
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decVMemInstsIssued();
+            if (isFlat()) {
+                wf->decLGKMInstsIssued();
+            }
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
+
+        issueRequestHelper(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initMemRead<3>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
+        VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 3];
+                vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 3 + 1];
+                vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 3 + 2];
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
+        vdst2.write();
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_LOAD_DWORDX4 class methods ---
+
+    Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4(
+        InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_load_dwordx4")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+    } // Inst_FLAT__FLAT_LOAD_DWORDX4
+
+    Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4()
+    {
+    } // ~Inst_FLAT__FLAT_LOAD_DWORDX4
+
+    // --- description from .arch file ---
+    // Untyped buffer load 4 dwords.
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decVMemInstsIssued();
+            if (isFlat()) {
+                wf->decLGKMInstsIssued();
+            }
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
+
+        issueRequestHelper(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initMemRead<4>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
+        VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
+        VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4];
+                vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 1];
+                vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 2];
+                vdst3[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 3];
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
+        vdst2.write();
+        vdst3.write();
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_STORE_BYTE class methods ---
+
+    Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_store_byte")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+    } // Inst_FLAT__FLAT_STORE_BYTE
+
+    Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE()
+    {
+    } // ~Inst_FLAT__FLAT_STORE_BYTE
+
+    // --- description from .arch file ---
+    // Untyped buffer store byte.
+ void + Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU8 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_STORE_SHORT class methods --- + + Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_short") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_SHORT + + Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT() + { + } // ~Inst_FLAT__FLAT_STORE_SHORT + + // --- description from .arch file --- + // Untyped buffer store short. 
+ void + Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU16 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods --- + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI:: + Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_short_d16_hi") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI() + { + } // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + // --- description from .arch file --- + // Untyped buffer store short. 
+ void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = (data[lane] >> 16); + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORD class methods --- + + Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORD + + Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() + { + } // ~Inst_FLAT__FLAT_STORE_DWORD + + // --- description from .arch file --- + // Untyped buffer store dword. 
+ void + Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX2 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX2 + + Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer store 2 dwords. 
+ void + Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU64 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX3 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx3") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX3 + + Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer store 3 dwords. 
+ void + Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data0(gpuDynInst, extData.DATA); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); + + data0.read(); + data1.read(); + data2.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<3>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX4 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX4 + + Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer store 4 dwords. 
+ void + Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data0(gpuDynInst, extData.DATA); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); + + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SWAP class methods --- + + Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SWAP + + Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SWAP + + // --- description from .arch file --- + // 
32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + + // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP + ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() + { + } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD + + Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SUB class methods --- + + Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SUB + + Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMIN class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMIN + + Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMIN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMIN::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMIN class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMIN + + Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMIN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMIN::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMAX class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMAX + + Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMAX::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMAX::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMAX class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMAX + + Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMAX::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMAX::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_AND class methods --- + + Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_and") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_AND + + Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() + { + } // ~Inst_FLAT__FLAT_ATOMIC_AND + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_AND::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_AND::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_OR class methods --- + + Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_or") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_OR + + Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() + { + } // ~Inst_FLAT__FLAT_ATOMIC_OR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_OR::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_OR::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + + // --- Inst_FLAT__FLAT_ATOMIC_XOR class methods --- + + Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_xor") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_XOR + + Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() + { + } // ~Inst_FLAT__FLAT_ATOMIC_XOR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_XOR::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_XOR::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_INC class methods --- + + Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_inc") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_INC + + Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() + { + } // ~Inst_FLAT__FLAT_ATOMIC_INC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_DEC class methods --- + + Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_dec") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_DEC + + Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() + { + } // ~Inst_FLAT__FLAT_ATOMIC_DEC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SWAP_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_swap_x2") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 + + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA[0:1]; + // cmp = DATA[2:3]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_x2") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 + + Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SUB_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_sub_x2") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 + + Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+    void
+    Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        atomicExecute(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        atomicComplete(gpuDynInst);
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_ATOMIC_SMIN_X2 class methods ---
+
+    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2(
+          InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_atomic_smin_x2")
+    {
+        setFlag(AtomicMin);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        setFlag(MemoryRef);
+    } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2
+
+    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2()
+    {
+    } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2
+
+    // --- description from .arch file ---
+    // 64b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
+    // RETURN_DATA[0:1] = tmp.
+    void
+    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        atomicExecute(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        atomicComplete(gpuDynInst);
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_ATOMIC_UMIN_X2 class methods ---
+
+    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2(
+          InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_atomic_umin_x2")
+    {
+        setFlag(AtomicMin);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        setFlag(MemoryRef);
+    } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2
+
+    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2()
+    {
+    } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2
+
+    // --- description from .arch file ---
+    // 64b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
+    // RETURN_DATA[0:1] = tmp.
+    void
+    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        atomicExecute(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        atomicComplete(gpuDynInst);
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_ATOMIC_SMAX_X2 class methods ---
+
+    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2(
+          InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_atomic_smax_x2")
+    {
+        setFlag(AtomicMax);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        setFlag(MemoryRef);
+    } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2
+
+    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2()
+    {
+    } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2
+
+    // --- description from .arch file ---
+    // 64b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
+    // RETURN_DATA[0:1] = tmp.
+    void
+    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        atomicExecute(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        atomicComplete(gpuDynInst);
+    } // completeAcc
+    // --- Inst_FLAT__FLAT_ATOMIC_UMAX_X2 class methods ---
+
+    Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2(
+          InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_atomic_umax_x2")
+    {
+        setFlag(AtomicMax);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        setFlag(MemoryRef);
+    } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2
+
+    Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2()
+    {
+    } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2
+
+    // --- description from .arch file ---
+    // 64b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
+    // RETURN_DATA[0:1] = tmp.
+ void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_AND_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_and_x2") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_AND_X2 + + Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_OR_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_or_x2") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_OR_X2 + + Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_OR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_OR_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_XOR_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_xor_x2") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 + + Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_INC_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_inc_x2") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_INC_X2 + + Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 
0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_DEC_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_dec_x2") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_F32 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_F32::Inst_FLAT__FLAT_ATOMIC_ADD_F32( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_f32") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_F32 + + Inst_FLAT__FLAT_ATOMIC_ADD_F32::~Inst_FLAT__FLAT_ATOMIC_ADD_F32() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F32 + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 class methods --- + + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_pk_add_f16") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 + + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16() + { + } // ~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 + + void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + 
void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_F64::Inst_FLAT__FLAT_ATOMIC_ADD_F64( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_f64") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_F64 + + Inst_FLAT__FLAT_ATOMIC_ADD_F64::~Inst_FLAT__FLAT_ATOMIC_ADD_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F64 + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_MIN_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_MIN_F64::Inst_FLAT__FLAT_ATOMIC_MIN_F64( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_min_f64") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_MIN_F64 + + Inst_FLAT__FLAT_ATOMIC_MIN_F64::~Inst_FLAT__FLAT_ATOMIC_MIN_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_MIN_F64 + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_MAX_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_MAX_F64::Inst_FLAT__FLAT_ATOMIC_MAX_F64( + InFmt_FLAT *iFmt) + : 
Inst_FLAT(iFmt, "flat_atomic_max_f64") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_MAX_F64 + + Inst_FLAT__FLAT_ATOMIC_MAX_F64::~Inst_FLAT__FLAT_ATOMIC_MAX_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_MAX_F64 + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/inst_util.hh b/src/arch/amdgpu/vega/insts/inst_util.hh index 7ec2e2ddd3..bc64ff88da 100644 --- a/src/arch/amdgpu/vega/insts/inst_util.hh +++ b/src/arch/amdgpu/vega/insts/inst_util.hh @@ -35,6 +35,7 @@ #include #include "arch/amdgpu/vega/gpu_registers.hh" +#include "arch/amdgpu/vega/insts/gpu_static_inst.hh" namespace gem5 { @@ -315,7 +316,8 @@ namespace VegaISA * 0x142: broadcast 15th thread of each row to next row * 0x143: broadcast thread 31 to rows 2 and 3 */ - int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, + inline int + dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool & outOfBounds) { // local variables diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc deleted file mode 100644 index 651b6dc9f9..0000000000 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ /dev/null @@ -1,46539 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "arch/amdgpu/vega/insts/instructions.hh" - -#include - -#include "arch/amdgpu/vega/insts/inst_util.hh" -#include "debug/VEGA.hh" -#include "debug/GPUSync.hh" -#include "dev/amdgpu/hwreg_defines.hh" -#include "gpu-compute/shader.hh" - -namespace gem5 -{ - -namespace VegaISA -{ - // --- Inst_SOP2__S_ADD_U32 class methods --- - - Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_U32 - - Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() - { - } // ~Inst_SOP2__S_ADD_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // SCC = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an unsigned - // --- overflow/carry-out for S_ADDC_U32. - void - Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) - >= 0x100000000ULL ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUB_U32 class methods --- - - Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_U32 - - Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() - { - } // ~Inst_SOP2__S_SUB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out for - // --- S_SUBB_U32. 
- void - Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (src1.rawData() > src0.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ADD_I32 class methods --- - - Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_I32 - - Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() - { - } // ~Inst_SOP2__S_ADD_I32 - - // --- description from .arch file --- - // D.i = S0.i + S1.i; - // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. - // This opcode is not suitable for use with S_ADDC_U32 for implementing - // 64-bit operations. - void - Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) - ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUB_I32 class methods --- - - Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_I32 - - Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() - { - } // ~Inst_SOP2__S_SUB_I32 - - // --- description from .arch file --- - // D.i = S0.i - S1.i; - // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. 
- // CAUTION: The condition code behaviour for this opcode is inconsistent - // with V_SUB_I32; see V_SUB_I32 for further details. - // This opcode is not suitable for use with S_SUBB_U32 for implementing - // 64-bit operations. - void - Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ADDC_U32 class methods --- - - Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_addc_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADDC_U32 - - Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() - { - } // ~Inst_SOP2__S_ADDC_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + SCC; - // SCC = (S0.u + S1.u + SCC >= 0x800000000ULL ? 1 : 0) is an unsigned - // overflow. - void - Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() + src1.rawData() + scc.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() - + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUBB_U32 class methods --- - - Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_subb_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUBB_U32 - - Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() - { - } // ~Inst_SOP2__S_SUBB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - SCC; - // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. - void - Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() - src1.rawData() - scc.rawData(); - scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MIN_I32 class methods --- - - Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_I32 - - Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() - { - } // ~Inst_SOP2__S_MIN_I32 - - // --- description from .arch file --- - // D.i = (S0.i < S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the minimum value. - void - Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MIN_U32 class methods --- - - Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_U32 - - Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() - { - } // ~Inst_SOP2__S_MIN_U32 - - // --- description from .arch file --- - // D.u = (S0.u < S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the minimum value. - void - Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MAX_I32 class methods --- - - Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_I32 - - Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() - { - } // ~Inst_SOP2__S_MAX_I32 - - // --- description from .arch file --- - // D.i = (S0.i > S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the maximum value. - void - Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MAX_U32 class methods --- - - Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_U32 - - Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() - { - } // ~Inst_SOP2__S_MAX_U32 - - // --- description from .arch file --- - // D.u = (S0.u > S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the maximum value. - void - Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_CSELECT_B32 class methods --- - - Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B32 - - Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() - { - } // ~Inst_SOP2__S_CSELECT_B32 - - // --- description from .arch file --- - // D.u = SCC ? S0.u : S1.u (conditional select). - void - Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? 
src0.rawData() : src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_CSELECT_B64 class methods --- - - Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B64 - - Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() - { - } // ~Inst_SOP2__S_CSELECT_B64 - - // --- description from .arch file --- - // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). - void - Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? src0.rawData() : src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_AND_B32 class methods --- - - Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_and_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_AND_B32 - - Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() - { - } // ~Inst_SOP2__S_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() & src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_AND_B64 class methods --- - - Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_and_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_AND_B64 - - Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() - { - } // ~Inst_SOP2__S_AND_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 & S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() & src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_OR_B32 class methods --- - - Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_or_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_OR_B32 - - Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() - { - } // ~Inst_SOP2__S_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() | src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_OR_B64 class methods --- - - Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_or_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_OR_B64 - - Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() - { - } // ~Inst_SOP2__S_OR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 | S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() | src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XOR_B32 class methods --- - - Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_XOR_B32 - - Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() - { - } // ~Inst_SOP2__S_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() ^ src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XOR_B64 class methods --- - - Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_XOR_B64 - - Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() - { - } // ~Inst_SOP2__S_XOR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 ^ S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() ^ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ANDN2_B32 class methods --- - - Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_andn2_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_ANDN2_B32 - - Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() - { - } // ~Inst_SOP2__S_ANDN2_B32 - - // --- description from .arch file --- - // D.u = S0.u & ~S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() &~ src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ANDN2_B64 class methods --- - - Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_andn2_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_ANDN2_B64 - - Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() - { - } // ~Inst_SOP2__S_ANDN2_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 & ~S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() &~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ORN2_B32 class methods --- - - Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_orn2_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_ORN2_B32 - - Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() - { - } // ~Inst_SOP2__S_ORN2_B32 - - // --- description from .arch file --- - // D.u = S0.u | ~S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() |~ src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ORN2_B64 class methods --- - - Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_orn2_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_ORN2_B64 - - Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() - { - } // ~Inst_SOP2__S_ORN2_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 | ~S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() |~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NAND_B32 class methods --- - - Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nand_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_NAND_B32 - - Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() - { - } // ~Inst_SOP2__S_NAND_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u & S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() & src1.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NAND_B64 class methods --- - - Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nand_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_NAND_B64 - - Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() - { - } // ~Inst_SOP2__S_NAND_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 & S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() & src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NOR_B32 class methods --- - - Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_NOR_B32 - - Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() - { - } // ~Inst_SOP2__S_NOR_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u | S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() | src1.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NOR_B64 class methods --- - - Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_NOR_B64 - - Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() - { - } // ~Inst_SOP2__S_NOR_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 | S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() | src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XNOR_B32 class methods --- - - Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xnor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_XNOR_B32 - - Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() - { - } // ~Inst_SOP2__S_XNOR_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u ^ S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() ^ src1.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XNOR_B64 class methods --- - - Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xnor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_XNOR_B64 - - Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() - { - } // ~Inst_SOP2__S_XNOR_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 ^ S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() ^ src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHL_B32 class methods --- - - Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshl_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHL_B32 - - Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() - { - } // ~Inst_SOP2__S_LSHL_B32 - - // --- description from .arch file --- - // D.u = S0.u << S1.u[4:0]; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() << bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHL_B64 class methods --- - - Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshl_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHL_B64 - - Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() - { - } // ~Inst_SOP2__S_LSHL_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 << S1.u[5:0]; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() << bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHR_B32 class methods --- - - Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshr_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHR_B32 - - Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() - { - } // ~Inst_SOP2__S_LSHR_B32 - - // --- description from .arch file --- - // D.u = S0.u >> S1.u[4:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to zero. - void - Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHR_B64 class methods --- - - Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshr_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHR_B64 - - Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() - { - } // ~Inst_SOP2__S_LSHR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 >> S1.u[5:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to zero. - void - Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ASHR_I32 class methods --- - - Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_ashr_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ASHR_I32 - - Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() - { - } // ~Inst_SOP2__S_ASHR_I32 - - // --- description from .arch file --- - // D.i = signext(S0.i) >> S1.u[4:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to the sign bit of the input value. - void - Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ASHR_I64 class methods --- - - Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_ashr_i64") - { - setFlag(ALU); - } // Inst_SOP2__S_ASHR_I64 - - Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() - { - } // ~Inst_SOP2__S_ASHR_I64 - - // --- description from .arch file --- - // D.i64 = signext(S0.i64) >> S1.u[5:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to the sign bit of the input value. - void - Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFM_B32 class methods --- - - Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfm_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFM_B32 - - Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32() - { - } // ~Inst_SOP2__S_BFM_B32 - - // --- description from .arch file --- - // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask). 
- void - Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1) - << bits(src1.rawData(), 4, 0); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_BFM_B64 class methods --- - - Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfm_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFM_B64 - - Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64() - { - } // ~Inst_SOP2__S_BFM_B64 - - // --- description from .arch file --- - // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask). - void - Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1) - << bits(src1.rawData(), 5, 0); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_MUL_I32 class methods --- - - Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_I32 - - Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32() - { - } // ~Inst_SOP2__S_MUL_I32 - - // --- description from .arch file --- - // D.i = S0.i * S1.i. 
- void - Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = src0.rawData() * src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_BFE_U32 class methods --- - - Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_U32 - - Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32() - { - } // ~Inst_SOP2__S_BFE_U32 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is - // field width. - // D.u = (S0.u>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_I32 class methods --- - - Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I32 - - Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32() - { - } // ~Inst_SOP2__S_BFE_I32 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is - // field width. - // D.i = (S0.i>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - - // Above extracted a signed int of size src1[22:16] bits which needs - // to be signed-extended. Check if the MSB of our src1[22:16]-bit - // integer is 1, and sign extend it is. - // - // Note: The description in the Vega ISA manual does not mention to - // sign-extend the result. 
An update description can be found in the - // more recent RDNA3 manual here: - // https://developer.amd.com/wp-content/resources/ - // RDNA3_Shader_ISA_December2022.pdf - if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { - sdst = sdst.rawData() - | (0xffffffff << bits(src1.rawData(), 22, 16)); - } - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_U64 class methods --- - - Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_u64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_U64 - - Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64() - { - } // ~Inst_SOP2__S_BFE_U64 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.u64 = (S0.u64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_I64 class methods --- - - Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I64 - - Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64() - { - } // ~Inst_SOP2__S_BFE_I64 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.i64 = (S0.i64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - - // Above extracted a signed int of size src1[22:16] bits which needs - // to be signed-extended. Check if the MSB of our src1[22:16]-bit - // integer is 1, and sign extend it is. - if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { - sdst = sdst.rawData() - | 0xffffffffffffffff << bits(src1.rawData(), 22, 16); - } - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_CBRANCH_G_FORK class methods --- - - Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cbranch_g_fork") - { - setFlag(Branch); - } // Inst_SOP2__S_CBRANCH_G_FORK - - Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK() - { - } // ~Inst_SOP2__S_CBRANCH_G_FORK - - // --- description from .arch file --- - // mask_pass = S0.u64 & EXEC; - // mask_fail = ~S0.u64 & EXEC; - // if(mask_pass == EXEC) - // PC = S1.u64; - // elsif(mask_fail == EXEC) - // PC += 4; - // elsif(bitcount(mask_fail) < bitcount(mask_pass)) - // EXEC = mask_fail; - // SGPR[CSP*4] = { S1.u64, mask_pass }; - // CSP++; - // PC += 4; - // else - // EXEC = mask_pass; - // SGPR[CSP*4] = { PC + 4, mask_fail }; - // CSP++; - // PC = S1.u64; - // end. - // Conditional branch using branch-stack. - // S0 = compare mask(vcc or any sgpr) and - // S1 = 64-bit byte address of target instruction. - // See also S_CBRANCH_JOIN. - void - Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP2__S_ABSDIFF_I32 class methods --- - - Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_absdiff_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ABSDIFF_I32 - - Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32() - { - } // ~Inst_SOP2__S_ABSDIFF_I32 - - // --- description from .arch file --- - // D.i = S0.i - S1.i; - // if(D.i < 0) then D.i = -D.i; - // SCC = 1 if result is non-zero. - // Compute the absolute value of difference between two values. 
- void - Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - sdst = std::abs(src0.rawData() - src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_RFE_RESTORE_B64 class methods --- - - Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64( - InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_rfe_restore_b64") - { - } // Inst_SOP2__S_RFE_RESTORE_B64 - - Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64() - { - } // ~Inst_SOP2__S_RFE_RESTORE_B64 - - // --- description from .arch file --- - // PRIV = 0; - // PC = S0.u64; - // INST_ATC = S1.u32[0]. - // Return from exception handler and continue, possibly changing the - // --- instruction ATC mode. - // This instruction may only be used within a trap handler. - // Use this instruction when the main program may be in a different memory - // --- space than the trap handler. 
- void - Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP2__S_MUL_HI_U32 class methods --- - - Inst_SOP2__S_MUL_HI_U32::Inst_SOP2__S_MUL_HI_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_hi_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_HI_U32 - - Inst_SOP2__S_MUL_HI_U32::~Inst_SOP2__S_MUL_HI_U32() - { - } // ~Inst_SOP2__S_MUL_HI_U32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32; - void - Inst_SOP2__S_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - VecElemU64 tmp_dst = - ((VecElemU64)src0.rawData() * (VecElemU64)src1.rawData()); - sdst = (tmp_dst >> 32); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_MUL_HI_I32 class methods --- - - Inst_SOP2__S_MUL_HI_I32::Inst_SOP2__S_MUL_HI_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_hi_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_HI_I32 - - Inst_SOP2__S_MUL_HI_I32::~Inst_SOP2__S_MUL_HI_I32() - { - } // ~Inst_SOP2__S_MUL_HI_I32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32; - void - Inst_SOP2__S_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - VecElemI64 tmp_src0 = - sext::digits>(src0.rawData()); - VecElemI64 tmp_src1 = - sext::digits>(src1.rawData()); - sdst = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - - sdst.write(); - } // execute - // --- Inst_SOPK__S_MOVK_I32 class methods --- - - Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_movk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MOVK_I32 - - 
Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() - { - } // ~Inst_SOPK__S_MOVK_I32 - - // --- description from .arch file --- - // D.i = signext(SIMM16) (sign extension). - void - Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - sdst = simm16; - - sdst.write(); - } // execute - // --- Inst_SOPK__S_CMOVK_I32 class methods --- - - Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmovk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMOVK_I32 - - Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() - { - } // ~Inst_SOPK__S_CMOVK_I32 - - // --- description from .arch file --- - // if(SCC) then D.i = signext(SIMM16); - // else NOP. - // Conditional move with sign extension. - void - Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - sdst = simm16; - sdst.write(); - } - } // execute - // --- Inst_SOPK__S_CMPK_EQ_I32 class methods --- - - Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_I32 - - Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() - { - } // ~Inst_SOPK__S_CMPK_EQ_I32 - - // --- description from .arch file --- - // SCC = (S0.i == signext(SIMM16)). - void - Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LG_I32 class methods --- - - Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_I32 - - Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() - { - } // ~Inst_SOPK__S_CMPK_LG_I32 - - // --- description from .arch file --- - // SCC = (S0.i != signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GT_I32 class methods --- - - Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_I32 - - Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() - { - } // ~Inst_SOPK__S_CMPK_GT_I32 - - // --- description from .arch file --- - // SCC = (S0.i > signext(SIMM16)). - void - Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GE_I32 class methods --- - - Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_I32 - - Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() - { - } // ~Inst_SOPK__S_CMPK_GE_I32 - - // --- description from .arch file --- - // SCC = (S0.i >= signext(SIMM16)). 
- void - Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LT_I32 class methods --- - - Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_I32 - - Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() - { - } // ~Inst_SOPK__S_CMPK_LT_I32 - - // --- description from .arch file --- - // SCC = (S0.i < signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LE_I32 class methods --- - - Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_I32 - - Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32() - { - } // ~Inst_SOPK__S_CMPK_LE_I32 - - // --- description from .arch file --- - // SCC = (S0.i <= signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_EQ_U32 class methods --- - - Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_U32 - - Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32() - { - } // ~Inst_SOPK__S_CMPK_EQ_U32 - - // --- description from .arch file --- - // SCC = (S0.u == SIMM16). - void - Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LG_U32 class methods --- - - Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_U32 - - Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32() - { - } // ~Inst_SOPK__S_CMPK_LG_U32 - - // --- description from .arch file --- - // SCC = (S0.u != SIMM16). - void - Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GT_U32 class methods --- - - Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_U32 - - Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32() - { - } // ~Inst_SOPK__S_CMPK_GT_U32 - - // --- description from .arch file --- - // SCC = (S0.u > SIMM16). 
- void - Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GE_U32 class methods --- - - Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_U32 - - Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32() - { - } // ~Inst_SOPK__S_CMPK_GE_U32 - - // --- description from .arch file --- - // SCC = (S0.u >= SIMM16). - void - Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LT_U32 class methods --- - - Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_U32 - - Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32() - { - } // ~Inst_SOPK__S_CMPK_LT_U32 - - // --- description from .arch file --- - // SCC = (S0.u < SIMM16). - void - Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LE_U32 class methods --- - - Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_U32 - - Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32() - { - } // ~Inst_SOPK__S_CMPK_LE_U32 - - // --- description from .arch file --- - // SCC = (S0.u <= SIMM16). - void - Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_ADDK_I32 class methods --- - - Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_addk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_ADDK_I32 - - Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32() - { - } // ~Inst_SOPK__S_ADDK_I32 - - // --- description from .arch file --- - // D.i = D.i + signext(SIMM16); - // SCC = overflow. - void - Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16); - scc = (bits(src.rawData(), 31) == bits(simm16, 15) - && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOPK__S_MULK_I32 class methods --- - - Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_mulk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MULK_I32 - - Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32() - { - } // ~Inst_SOPK__S_MULK_I32 - - // --- description from .arch file --- - // D.i = D.i * signext(SIMM16). 
- void - Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16); - - sdst.write(); - } // execute - // --- Inst_SOPK__S_CBRANCH_I_FORK class methods --- - - Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cbranch_i_fork") - { - setFlag(Branch); - } // Inst_SOPK__S_CBRANCH_I_FORK - - Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK() - { - } // ~Inst_SOPK__S_CBRANCH_I_FORK - - // --- description from .arch file --- - // mask_pass = S0.u64 & EXEC; - // mask_fail = ~S0.u64 & EXEC; - // target_addr = PC + signext(SIMM16 * 4) + 4; - // if(mask_pass == EXEC) - // PC = target_addr; - // elsif(mask_fail == EXEC) - // PC += 4; - // elsif(bitcount(mask_fail) < bitcount(mask_pass)) - // EXEC = mask_fail; - // SGPR[CSP*4] = { target_addr, mask_pass }; - // CSP++; - // PC += 4; - // else - // EXEC = mask_pass; - // SGPR[CSP*4] = { PC + 4, mask_fail }; - // CSP++; - // PC = target_addr; - // end. - // Conditional branch using branch-stack. - // S0 = compare mask(vcc or any sgpr), and - // SIMM16 = signed DWORD branch offset relative to next instruction. - // See also S_CBRANCH_JOIN. - void - Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPK__S_GETREG_B32 class methods --- - - Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_getreg_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_GETREG_B32 - - Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32() - { - } // ~Inst_SOPK__S_GETREG_B32 - - // --- description from .arch file --- - // D.u = hardware-reg. Read some or all of a hardware register into the - // LSBs of D. 
- // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - sdst.read(); - - // Store value from hardware to part of the SDST. - ScalarRegU32 mask = (((1U << size) - 1U) << offset); - sdst = (hwreg & mask) >> offset; - sdst.write(); - } // execute - // --- Inst_SOPK__S_SETREG_B32 class methods --- - - Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_setreg_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_SETREG_B32 - - Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() - { - } // ~Inst_SOPK__S_SETREG_B32 - - // --- description from .arch file --- - // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware - // register. - // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - sdst.read(); - - // Store value from SDST to part of the hardware register. 
- ScalarRegU32 mask = (((1U << size) - 1U) << offset); - hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask)); - gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); - - // set MODE register to control the behavior of single precision - // floating-point numbers: denormal mode or round mode - if (hwregId==1 && size==2 - && (offset==4 || offset==0)) { - warn_once("Be cautious that s_setreg_b32 has no real effect " - "on FP modes: %s\n", gpuDynInst->disassemble()); - return; - } - - // panic if not changing MODE of floating-point numbers - panicUnimplemented(); - } // execute - // --- Inst_SOPK__S_SETREG_IMM32_B32 class methods --- - - Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32( - InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_setreg_imm32_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_SETREG_IMM32_B32 - - Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32() - { - } // ~Inst_SOPK__S_SETREG_IMM32_B32 - - // --- description from .arch file --- - // Write some or all of the LSBs of IMM32 into a hardware register; this - // --- instruction requires a 32-bit literal constant. - // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarRegI32 simm32 = extData.imm_u32; - - // Store value from SIMM32 to part of the hardware register. 
- ScalarRegU32 mask = (((1U << size) - 1U) << offset); - hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask)); - gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); - - // set MODE register to control the behavior of single precision - // floating-point numbers: denormal mode or round mode - if (hwregId==HW_REG_MODE && size==2 - && (offset==4 || offset==0)) { - warn_once("Be cautious that s_setreg_imm32_b32 has no real effect " - "on FP modes: %s\n", gpuDynInst->disassemble()); - return; - } - - // panic if not changing modes of single-precision FPs - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_MOV_B32 class methods --- - - Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_B32 - - Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32() - { - } // ~Inst_SOP1__S_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. - void - Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOV_B64 class methods --- - - Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_B64 - - Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64() - { - } // ~Inst_SOP1__S_MOV_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64. 
- void - Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_CMOV_B32 class methods --- - - Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cmov_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_CMOV_B32 - - Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32() - { - } // ~Inst_SOP1__S_CMOV_B32 - - // --- description from .arch file --- - // (SCC) then D.u = S0.u; - // else NOP. - // Conditional move. - void - Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - scc.read(); - - if (scc.rawData()) { - sdst = src.rawData(); - sdst.write(); - } - } // execute - // --- Inst_SOP1__S_CMOV_B64 class methods --- - - Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cmov_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_CMOV_B64 - - Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64() - { - } // ~Inst_SOP1__S_CMOV_B64 - - // --- description from .arch file --- - // if(SCC) then D.u64 = S0.u64; - // else NOP. - // Conditional move. 
- void - Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - scc.read(); - - if (scc.rawData()) { - sdst = src.rawData(); - sdst.write(); - } - } // execute - // --- Inst_SOP1__S_NOT_B32 class methods --- - - Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_not_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_NOT_B32 - - Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32() - { - } // ~Inst_SOP1__S_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u; - // SCC = 1 if result is non-zero. - // Bitwise negation. - void - Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = ~src.rawData(); - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NOT_B64 class methods --- - - Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_not_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_NOT_B64 - - Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64() - { - } // ~Inst_SOP1__S_NOT_B64 - - // --- description from .arch file --- - // D.u64 = ~S0.u64; - // SCC = 1 if result is non-zero. - // Bitwise negation. - void - Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = ~src.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_WQM_B32 class methods --- - - Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_wqm_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_WQM_B32 - - Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32() - { - } // ~Inst_SOP1__S_WQM_B32 - - // --- description from .arch file --- - // D[i] = (S0[(i & ~3):(i | 3)] != 0); - // Computes whole quad mode for an active/valid mask. - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wholeQuadMode(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_WQM_B64 class methods --- - - Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_wqm_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_WQM_B64 - - Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64() - { - } // ~Inst_SOP1__S_WQM_B64 - - // --- description from .arch file --- - // D[i] = (S0[(i & ~3):(i | 3)] != 0); - // Computes whole quad mode for an active/valid mask. - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wholeQuadMode(src.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BREV_B32 class methods --- - - Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_brev_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BREV_B32 - - Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32() - { - } // ~Inst_SOP1__S_BREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31] (reverse bits). - void - Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = reverseBits(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BREV_B64 class methods --- - - Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_brev_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BREV_B64 - - Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64() - { - } // ~Inst_SOP1__S_BREV_B64 - - // --- description from .arch file --- - // D.u64[63:0] = S0.u64[0:63] (reverse bits). - void - Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = reverseBits(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BCNT0_I32_B32 class methods --- - - Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt0_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT0_I32_B32 - - Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32() - { - } // ~Inst_SOP1__S_BCNT0_I32_B32 - - // --- description from .arch file --- - // D.i = CountZeroBits(S0.u); - // SCC = 1 if result is non-zero. 
- void - Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = countZeroBits(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT0_I32_B64 class methods --- - - Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt0_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT0_I32_B64 - - Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64() - { - } // ~Inst_SOP1__S_BCNT0_I32_B64 - - // --- description from .arch file --- - // D.i = CountZeroBits(S0.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = countZeroBits(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT1_I32_B32 class methods --- - - Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt1_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT1_I32_B32 - - Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32() - { - } // ~Inst_SOP1__S_BCNT1_I32_B32 - - // --- description from .arch file --- - // D.i = CountOneBits(S0.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = popCount(src.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT1_I32_B64 class methods --- - - Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt1_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT1_I32_B64 - - Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64() - { - } // ~Inst_SOP1__S_BCNT1_I32_B64 - - // --- description from .arch file --- - // D.i = CountOneBits(S0.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = popCount(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_FF0_I32_B32 class methods --- - - Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff0_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FF0_I32_B32 - - Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32() - { - } // ~Inst_SOP1__S_FF0_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstZero(S0.u); - // If no zeros are found, return -1. - // Returns the bit position of the first zero from the LSB. 
- void - Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstZero(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF0_I32_B64 class methods --- - - Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff0_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FF0_I32_B64 - - Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() - { - } // ~Inst_SOP1__S_FF0_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstZero(S0.u64); - // If no zeros are found, return -1. - // Returns the bit position of the first zero from the LSB. - void - Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstZero(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF1_I32_B32 class methods --- - - Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff1_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FF1_I32_B32 - - Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() - { - } // ~Inst_SOP1__S_FF1_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u); - // If no ones are found, return -1. - // Returns the bit position of the first one from the LSB. 
- void - Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstOne(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF1_I32_B64 class methods --- - - Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff1_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FF1_I32_B64 - - Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() - { - } // ~Inst_SOP1__S_FF1_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u64); - // If no ones are found, return -1. - // Returns the bit position of the first one from the LSB. - void - Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstOne(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_B32 class methods --- - - Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_B32 - - Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() - { - } // ~Inst_SOP1__S_FLBIT_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u); - // If no ones are found, return -1. - // Counts how many zeros before the first one starting from the MSB. 
- void - Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = countZeroBitsMsb(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_B64 class methods --- - - Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_B64 - - Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() - { - } // ~Inst_SOP1__S_FLBIT_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u64); - // If no ones are found, return -1. - // Counts how many zeros before the first one starting from the MSB. - void - Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = countZeroBitsMsb(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32 class methods --- - - Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32 - - Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32() - { - } // ~Inst_SOP1__S_FLBIT_I32 - - // --- description from .arch file --- - // D.i = FirstOppositeSignBit(S0.i); - // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. - // Counts how many bits in a row (from MSB to LSB) are the same as the - // sign bit. 
- void - Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_I64 class methods --- - - Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_i64") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_I64 - - Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() - { - } // ~Inst_SOP1__S_FLBIT_I32_I64 - - // --- description from .arch file --- - // D.i = FirstOppositeSignBit(S0.i64); - // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. - // Counts how many bits in a row (from MSB to LSB) are the same as the - // sign bit. - void - Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SEXT_I32_I8 class methods --- - - Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_sext_i32_i8") - { - setFlag(ALU); - } // Inst_SOP1__S_SEXT_I32_I8 - - Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() - { - } // ~Inst_SOP1__S_SEXT_I32_I8 - - // --- description from .arch file --- - // D.i = signext(S0.i[7:0]) (sign extension). 
- void - Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = sext::digits>( - bits(src.rawData(), 7, 0)); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SEXT_I32_I16 class methods --- - - Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_sext_i32_i16") - { - setFlag(ALU); - } // Inst_SOP1__S_SEXT_I32_I16 - - Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16() - { - } // ~Inst_SOP1__S_SEXT_I32_I16 - - // --- description from .arch file --- - // D.i = signext(S0.i[15:0]) (sign extension). - void - Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = sext::digits>( - bits(src.rawData(), 15, 0)); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET0_B32 class methods --- - - Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset0_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET0_B32 - - Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32() - { - } // ~Inst_SOP1__S_BITSET0_B32 - - // --- description from .arch file --- - // D.u[S0.u[4:0]] = 0. 
- void - Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 4, 0), 0); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET0_B64 class methods --- - - Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset0_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET0_B64 - - Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64() - { - } // ~Inst_SOP1__S_BITSET0_B64 - - // --- description from .arch file --- - // D.u64[S0.u[5:0]] = 0. - void - Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 5, 0), 0); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET1_B32 class methods --- - - Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset1_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET1_B32 - - Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32() - { - } // ~Inst_SOP1__S_BITSET1_B32 - - // --- description from .arch file --- - // D.u[S0.u[4:0]] = 1. - void - Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 4, 0), 1); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET1_B64 class methods --- - - Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset1_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET1_B64 - - Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64() - { - } // ~Inst_SOP1__S_BITSET1_B64 - - // --- description from .arch file --- - // D.u64[S0.u[5:0]] = 1. 
- void - Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 5, 0), 1); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_GETPC_B64 class methods --- - - Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_getpc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_GETPC_B64 - - Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64() - { - } // ~Inst_SOP1__S_GETPC_B64 - - // --- description from .arch file --- - // D.u64 = PC + 4. - // Destination receives the byte address of the next instruction. - void - Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Addr pc = gpuDynInst->pc(); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - sdst = pc + 4; - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SETPC_B64 class methods --- - - Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_setpc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_SETPC_B64 - - Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() - { - } // ~Inst_SOP1__S_SETPC_B64 - - // --- description from .arch file --- - // PC = S0.u64. - // S0.u64 is a byte address of the instruction to jump to. - void - Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - - src.read(); - - wf->pc(src.rawData()); - } // execute - // --- Inst_SOP1__S_SWAPPC_B64 class methods --- - - Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_swappc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_SWAPPC_B64 - - Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() - { - } // ~Inst_SOP1__S_SWAPPC_B64 - - // --- description from .arch file --- - // D.u64 = PC + 4; PC = S0.u64. - // S0.u64 is a byte address of the instruction to jump to. 
- void - Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = pc + 4; - - wf->pc(src.rawData()); - sdst.write(); - } // execute - // --- Inst_SOP1__S_RFE_B64 class methods --- - - Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_rfe_b64") - { - } // Inst_SOP1__S_RFE_B64 - - Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64() - { - } // ~Inst_SOP1__S_RFE_B64 - - // --- description from .arch file --- - // PRIV = 0; - // PC = S0.u64. - // Return from exception handler and continue. - // This instruction may only be used within a trap handler. - void - Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_AND_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_and_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_AND_SAVEEXEC_B64 - - Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_AND_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 & EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() & wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_OR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_or_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_OR_SAVEEXEC_B64 - - Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_OR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 | EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() | wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_XOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_xor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_XOR_SAVEEXEC_B64 - - Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_XOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 ^ EXEC; - // SCC = 1 if the new value of EXEC is non-zero. 
- void - Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() ^ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_ANDN2_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_andn2_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64 - - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 & ~EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() &~ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_ORN2_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_orn2_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_ORN2_SAVEEXEC_B64 - - Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 | ~EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() |~ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NAND_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_nand_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_NAND_SAVEEXEC_B64 - - Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_NAND_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 & EXEC); - // SCC = 1 if the new value of EXEC is non-zero. 
- void - Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_nor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_NOR_SAVEEXEC_B64 - - Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_NOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 | EXEC); - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_XNOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_xnor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_XNOR_SAVEEXEC_B64 - - Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 ^ EXEC); - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_QUADMASK_B32 class methods --- - - Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_quadmask_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_QUADMASK_B32 - - Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32() - { - } // ~Inst_SOP1__S_QUADMASK_B32 - - // --- description from .arch file --- - // D.u = QuadMask(S0.u): - // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0; - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = quadMask(src.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_QUADMASK_B64 class methods --- - - Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_quadmask_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_QUADMASK_B64 - - Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64() - { - } // ~Inst_SOP1__S_QUADMASK_B64 - - // --- description from .arch file --- - // D.u64 = QuadMask(S0.u64): - // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0; - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = quadMask(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_MOVRELS_B32 class methods --- - - Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movrels_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELS_B32 - - Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32() - { - } // ~Inst_SOP1__S_MOVRELS_B32 - - // --- description from .arch file --- - // D.u = SGPR[S0.u + M0.u].u (move from relative source). 
- void - Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELS_B64 class methods --- - - Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movrels_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELS_B64 - - Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() - { - } // ~Inst_SOP1__S_MOVRELS_B64 - - // --- description from .arch file --- - // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). - // The index in M0.u must be even for this operation. - void - Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELD_B32 class methods --- - - Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movreld_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELD_B32 - - Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() - { - } // ~Inst_SOP1__S_MOVRELD_B32 - - // --- description from .arch file --- - // SGPR[D.u + M0.u].u = S0.u (move to relative destination). 
- void - Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData()); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELD_B64 class methods --- - - Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movreld_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELD_B64 - - Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() - { - } // ~Inst_SOP1__S_MOVRELD_B64 - - // --- description from .arch file --- - // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination). - // The index in M0.u must be even for this operation. - void - Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData()); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_CBRANCH_JOIN class methods --- - - Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cbranch_join") - { - setFlag(Branch); - setFlag(WritesEXEC); - } // Inst_SOP1__S_CBRANCH_JOIN - - Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() - { - } // ~Inst_SOP1__S_CBRANCH_JOIN - - // --- description from .arch file --- - // saved_csp = S0.u; - // if(CSP == saved_csp) then - // PC += 4; // Second time to JOIN: continue with program. - // else - // CSP -= 1; // First time to JOIN; jump to other FORK path. - // {PC, EXEC} = SGPR[CSP * 4]; // Read 128 bits from 4 consecutive - // SGPRs. - // end - // Conditional branch join point (end of conditional branch block). S0 is - // saved CSP value. - // See S_CBRANCH_G_FORK and S_CBRANCH_I_FORK for related instructions. 
- void - Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_ABS_I32 class methods --- - - Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_abs_i32") - { - setFlag(ALU); - } // Inst_SOP1__S_ABS_I32 - - Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() - { - } // ~Inst_SOP1__S_ABS_I32 - - // --- description from .arch file --- - // if(S.i < 0) then D.i = -S.i; - // else D.i = S.i; - // SCC = 1 if result is non-zero. - // Integer absolute value. - void - Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = std::abs(src.rawData()); - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_MOV_FED_B32 class methods --- - - Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_fed_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_FED_B32 - - Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() - { - } // ~Inst_SOP1__S_MOV_FED_B32 - - // --- description from .arch file --- - // D.u = S0.u. Introduce an EDC double-detect error on write to the - // destination SGPR. - void - Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_SET_GPR_IDX_IDX class methods --- - - Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") - { - } // Inst_SOP1__S_SET_GPR_IDX_IDX - - Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() - { - } // ~Inst_SOP1__S_SET_GPR_IDX_IDX - - // --- description from .arch file --- - // M0[7:0] = S0.u[7:0]. - // Modify the index used in vector GPR indexing. 
- void - Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_I32 class methods --- - - Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_I32 - - Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() - { - } // ~Inst_SOPC__S_CMP_EQ_I32 - - // --- description from .arch file --- - // SCC = (S0.i == S1.i). - void - Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_I32 class methods --- - - Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_I32 - - Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() - { - } // ~Inst_SOPC__S_CMP_LG_I32 - - // --- description from .arch file --- - // SCC = (S0.i != S1.i). - void - Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GT_I32 class methods --- - - Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_I32 - - Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() - { - } // ~Inst_SOPC__S_CMP_GT_I32 - - // --- description from .arch file --- - // SCC = (S0.i > S1.i). 
- void - Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GE_I32 class methods --- - - Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_I32 - - Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() - { - } // ~Inst_SOPC__S_CMP_GE_I32 - - // --- description from .arch file --- - // SCC = (S0.i >= S1.i). - void - Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LT_I32 class methods --- - - Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_I32 - - Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() - { - } // ~Inst_SOPC__S_CMP_LT_I32 - - // --- description from .arch file --- - // SCC = (S0.i < S1.i). - void - Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LE_I32 class methods --- - - Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_I32 - - Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() - { - } // ~Inst_SOPC__S_CMP_LE_I32 - - // --- description from .arch file --- - // SCC = (S0.i <= S1.i). - void - Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_U32 class methods --- - - Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_U32 - - Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() - { - } // ~Inst_SOPC__S_CMP_EQ_U32 - - // --- description from .arch file --- - // SCC = (S0.u == S1.u). - void - Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_U32 class methods --- - - Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_U32 - - Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() - { - } // ~Inst_SOPC__S_CMP_LG_U32 - - // --- description from .arch file --- - // SCC = (S0.u != S1.u). 
- void - Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GT_U32 class methods --- - - Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_U32 - - Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() - { - } // ~Inst_SOPC__S_CMP_GT_U32 - - // --- description from .arch file --- - // SCC = (S0.u > S1.u). - void - Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GE_U32 class methods --- - - Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_U32 - - Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() - { - } // ~Inst_SOPC__S_CMP_GE_U32 - - // --- description from .arch file --- - // SCC = (S0.u >= S1.u). - void - Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LT_U32 class methods --- - - Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_U32 - - Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() - { - } // ~Inst_SOPC__S_CMP_LT_U32 - - // --- description from .arch file --- - // SCC = (S0.u < S1.u). - void - Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LE_U32 class methods --- - - Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_U32 - - Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() - { - } // ~Inst_SOPC__S_CMP_LE_U32 - - // --- description from .arch file --- - // SCC = (S0.u <= S1.u). - void - Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP0_B32 class methods --- - - Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp0_b32") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP0_B32 - - Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() - { - } // ~Inst_SOPC__S_BITCMP0_B32 - - // --- description from .arch file --- - // SCC = (S0.u[S1.u[4:0]] == 0). 
- void - Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP1_B32 class methods --- - - Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp1_b32") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP1_B32 - - Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32() - { - } // ~Inst_SOPC__S_BITCMP1_B32 - - // --- description from .arch file --- - // SCC = (S0.u[S1.u[4:0]] == 1). - void - Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP0_B64 class methods --- - - Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp0_b64") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP0_B64 - - Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64() - { - } // ~Inst_SOPC__S_BITCMP0_B64 - - // --- description from .arch file --- - // SCC = (S0.u64[S1.u[5:0]] == 0). - void - Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP1_B64 class methods --- - - Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp1_b64") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP1_B64 - - Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64() - { - } // ~Inst_SOPC__S_BITCMP1_B64 - - // --- description from .arch file --- - // SCC = (S0.u64[S1.u[5:0]] == 1). - void - Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_SETVSKIP class methods --- - - Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_setvskip") - { - } // Inst_SOPC__S_SETVSKIP - - Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP() - { - } // ~Inst_SOPC__S_SETVSKIP - - // --- description from .arch file --- - // VSKIP = S0.u[S1.u[4:0]]. - // Enables and disables VSKIP mode. - // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instuctions are - // issued. - // If any vector operations are outstanding, S_WAITCNT must be issued - // before executing. - // This instruction requires one waitstate after executing (e.g. S_NOP 0). - // Example: - // s_waitcnt 0 - // s_setvskip 1, 0 // Enable vskip mode. 
- // s_nop 1 - void - Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods --- - - Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_set_gpr_idx_on") - { - } // Inst_SOPC__S_SET_GPR_IDX_ON - - Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON() - { - } // ~Inst_SOPC__S_SET_GPR_IDX_ON - - // --- description from .arch file --- - // MODE.gpr_idx_en = 1; - // M0[7:0] = S0.u[7:0]; - // M0[15:12] = SIMM4 (direct contents of S1 field); - // // Remaining bits of M0 are unmodified. - // Enable GPR indexing mode. Vector operations after this will perform - // relative GPR addressing based on the contents of M0. The structure - // SQ_M0_GPR_IDX_WORD may be used to decode M0. - // The raw contents of the S1 field are read and used to set the enable - // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and - // S1[3] = VDST_REL. - void - Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_U64 class methods --- - - Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_u64") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_U64 - - Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64() - { - } // ~Inst_SOPC__S_CMP_EQ_U64 - - // --- description from .arch file --- - // SCC = (S0.i64 == S1.i64). - void - Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_U64 class methods --- - - Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_u64") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_U64 - - Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64() - { - } // ~Inst_SOPC__S_CMP_LG_U64 - - // --- description from .arch file --- - // SCC = (S0.i64 != S1.i64). - void - Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPP__S_NOP class methods --- - - Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_nop") - { - setFlag(Nop); - } // Inst_SOPP__S_NOP - - Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP() - { - } // ~Inst_SOPP__S_NOP - - // --- description from .arch file --- - // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time, - // 7 = 8 times. - // This instruction may be used to introduce wait states to resolve - // hazards; see the shader programming guide for details. Compare with - // S_SLEEP. - void - Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_SOPP__S_ENDPGM class methods --- - - Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_endpgm") - { - setFlag(EndOfKernel); - } // Inst_SOPP__S_ENDPGM - - Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM() - { - } // ~Inst_SOPP__S_ENDPGM - - // --- description from .arch file --- - // End of program; terminate wavefront. - // The hardware implicitly executes S_WAITCNT 0 before executing this - // --- instruction. - // See S_ENDPGM_SAVED for the context-switch version of this instruction. 
- void - Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ComputeUnit *cu = gpuDynInst->computeUnit(); - - // delete extra instructions fetched for completed work-items - wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1, - wf->instructionBuffer.end()); - - if (wf->pendingFetch) { - wf->dropFetch = true; - } - - wf->computeUnit->fetchStage.fetchUnit(wf->simdId) - .flushBuf(wf->wfSlotId); - wf->setStatus(Wavefront::S_STOPPED); - - int refCount = wf->computeUnit->getLds() - .decreaseRefCounter(wf->dispatchId, wf->wgId); - - /** - * The parent WF of this instruction is exiting, therefore - * it should not participate in this barrier any longer. This - * prevents possible deadlock issues if WFs exit early. - */ - int bar_id = WFBarrier::InvalidID; - if (wf->hasBarrier()) { - assert(wf->getStatus() != Wavefront::S_BARRIER); - bar_id = wf->barrierId(); - assert(bar_id != WFBarrier::InvalidID); - wf->releaseBarrier(); - cu->decMaxBarrierCnt(bar_id); - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the " - "program and decrementing max barrier count for " - "barrier Id%d. 
New max count: %d.\n", cu->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, - cu->maxBarrierCnt(bar_id)); - } - - DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", - wf->computeUnit->cu_id, wf->wgId, refCount); - - wf->computeUnit->registerManager->freeRegisters(wf); - wf->computeUnit->stats.completedWfs++; - wf->computeUnit->activeWaves--; - - panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less " - "than zero\n", wf->computeUnit->cu_id); - - DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", - wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId); - - for (int i = 0; i < wf->vecReads.size(); i++) { - if (wf->rawDist.find(i) != wf->rawDist.end()) { - wf->stats.readsPerWrite.sample(wf->vecReads.at(i)); - } - } - wf->vecReads.clear(); - wf->rawDist.clear(); - wf->lastInstExec = 0; - - if (!refCount) { - /** - * If all WFs have finished, and hence the WG has finished, - * then we can free up the barrier belonging to the parent - * WG, but only if we actually used a barrier (i.e., more - * than one WF in the WG). - */ - if (bar_id != WFBarrier::InvalidID) { - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " - "now complete. Releasing barrier Id%d.\n", cu->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId, - wf->barrierId()); - cu->releaseBarrier(bar_id); - } - - /** - * Last wavefront of the workgroup has executed return. 
If the - * workgroup is not the final one in the kernel, then simply - * retire it; however, if it is the final one, i.e., indicating - * the kernel end, then release operation (i.e., GL2 WB) is - * needed - */ - - //check whether the workgroup is indicating the kernel end, i.e., - //the last workgroup in the kernel - bool kernelEnd = - wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf); - - bool relNeeded = - wf->computeUnit->shader->impl_kern_end_rel; - - //if it is not a kernel end, then retire the workgroup directly - if (!kernelEnd || !relNeeded) { - wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); - wf->setStatus(Wavefront::S_STOPPED); - wf->computeUnit->stats.completedWGs++; - - return; - } - - /** - * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and - * retire the workgroup after receving response. - * note that GL0V and GL1 are read only, and they just forward GL2 - * WB request. When forwarding, GL1 send the request to all GL2 in - * the complex - */ - setFlag(MemSync); - setFlag(GlobalSegment); - // Notify Memory System of Kernel Completion - // Kernel End = isKernel + isMemSync - wf->setStatus(Wavefront::S_RETURNING); - gpuDynInst->simdId = wf->simdId; - gpuDynInst->wfSlotId = wf->wfSlotId; - gpuDynInst->wfDynId = wf->wfDynId; - - DPRINTF(GPUExec, "inject global memory fence for CU%d: " - "WF[%d][%d][%d]\n", wf->computeUnit->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId); - - // call shader to prepare the flush operations - wf->computeUnit->shader->prepareFlush(gpuDynInst); - - wf->computeUnit->stats.completedWGs++; - } else { - wf->computeUnit->shader->dispatcher().scheduleDispatch(); - } - } // execute - - // --- Inst_SOPP__S_BRANCH class methods --- - - Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_branch") - { - setFlag(Branch); - } // Inst_SOPP__S_BRANCH - - Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH() - { - } // ~Inst_SOPP__S_BRANCH - - // --- description from .arch file --- - // 
PC = PC + signext(SIMM16 * 4) + 4 (short jump). - // For a long jump, use S_SETPC. - void - Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_WAKEUP class methods --- - - Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_wakeup") - { - } // Inst_SOPP__S_WAKEUP - - Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP() - { - } // ~Inst_SOPP__S_WAKEUP - - // --- description from .arch file --- - // Allow a wave to 'ping' all the other waves in its threadgroup to force - // them to wake up immediately from an S_SLEEP instruction. The ping is - // ignored if the waves are not sleeping. - // This allows for more efficient polling on a memory location. The waves - // which are polling can sit in a long S_SLEEP between memory reads, but - // the wave which writes the value can tell them all to wake up early now - // that the data is available. This is useful for fBarrier implementations - // (speedup). - // This method is also safe from races because if any wave misses the ping, - // everything still works fine (whoever missed it just completes their - // normal S_SLEEP). - void - Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_SCC0 class methods --- - - Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_scc0") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_SCC0 - - Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0() - { - } // ~Inst_SOPP__S_CBRANCH_SCC0 - - // --- description from .arch file --- - // if(SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (!scc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_SCC1 class methods --- - - Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_scc1") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_SCC1 - - Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1() - { - } // ~Inst_SOPP__S_CBRANCH_SCC1 - - // --- description from .arch file --- - // if(SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_VCCZ class methods --- - - Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_vccz") - { - setFlag(Branch); - setFlag(ReadsVCC); - } // Inst_SOPP__S_CBRANCH_VCCZ - - Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ() - { - } // ~Inst_SOPP__S_CBRANCH_VCCZ - - // --- description from .arch file --- - // if(VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - - vcc.read(); - - if (!vcc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods --- - - Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_vccnz") - { - setFlag(Branch); - setFlag(ReadsVCC); - } // Inst_SOPP__S_CBRANCH_VCCNZ - - Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ() - { - } // ~Inst_SOPP__S_CBRANCH_VCCNZ - - // --- description from .arch file --- - // if(VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - vcc.read(); - - if (vcc.rawData()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_CBRANCH_EXECZ class methods --- - - Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_execz") - { - setFlag(Branch); - setFlag(ReadsEXEC); - } // Inst_SOPP__S_CBRANCH_EXECZ - - Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ() - { - } // ~Inst_SOPP__S_CBRANCH_EXECZ - - // --- description from .arch file --- - // if(EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().none()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_CBRANCH_EXECNZ class methods --- - - Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_execnz") - { - setFlag(Branch); - setFlag(ReadsEXEC); - } // Inst_SOPP__S_CBRANCH_EXECNZ - - Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ() - { - } // ~Inst_SOPP__S_CBRANCH_EXECNZ - - // --- description from .arch file --- - // if(EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().any()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_BARRIER class methods --- - - Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_barrier") - { - setFlag(MemBarrier); - } // Inst_SOPP__S_BARRIER - - Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER() - { - } // ~Inst_SOPP__S_BARRIER - - // --- description from .arch file --- - // Synchronize waves within a threadgroup. - // If not all waves of the threadgroup have been created yet, waits for - // entire group before proceeding. - // If some waves in the threadgroup have already terminated, this waits on - // only the surviving waves. - // Barriers are legal inside trap handlers. 
- void - Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ComputeUnit *cu = gpuDynInst->computeUnit(); - - if (wf->hasBarrier()) { - int bar_id = wf->barrierId(); - assert(wf->getStatus() == Wavefront::S_BARRIER); - cu->incNumAtBarrier(bar_id); - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at " - "barrier Id%d. %d waves now at barrier, %d waves " - "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId, - wf->wfDynId, bar_id, cu->numAtBarrier(bar_id), - cu->numYetToReachBarrier(bar_id)); - } - } // execute - // --- Inst_SOPP__S_SETKILL class methods --- - - Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_setkill") - { - } // Inst_SOPP__S_SETKILL - - Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL() - { - } // ~Inst_SOPP__S_SETKILL - - // --- description from .arch file --- - // set KILL bit to value of SIMM16[0]. - // Used primarily for debugging kill wave host command behavior. - void - Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_WAITCNT class methods --- - - Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_waitcnt") - { - setFlag(ALU); - setFlag(Waitcnt); - } // Inst_SOPP__S_WAITCNT - - Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT() - { - } // ~Inst_SOPP__S_WAITCNT - - // --- description from .arch file --- - // Wait for the counts of outstanding lds, vector-memory and - // --- export/vmem-write-data to be at or below the specified levels. - // SIMM16[3:0] = vmcount (vector memory operations), - // SIMM16[6:4] = export/mem-write-data count, - // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count). 
- void - Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 vm_cnt = 0; - ScalarRegI32 exp_cnt = 0; - ScalarRegI32 lgkm_cnt = 0; - vm_cnt = bits(instData.SIMM16, 3, 0); - exp_cnt = bits(instData.SIMM16, 6, 4); - lgkm_cnt = bits(instData.SIMM16, 12, 8); - gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT); - gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt); - } // execute - // --- Inst_SOPP__S_SETHALT class methods --- - - Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sethalt") - { - } // Inst_SOPP__S_SETHALT - - Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT() - { - } // ~Inst_SOPP__S_SETHALT - - // --- description from .arch file --- - // Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume. - // The halt flag is ignored while PRIV == 1 (inside trap handlers) but the - // shader will halt immediately after the handler returns if HALT is still - // set at that time. - void - Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SLEEP class methods --- - - Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sleep") - { - setFlag(ALU); - setFlag(Sleep); - } // Inst_SOPP__S_SLEEP - - Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP() - { - } // ~Inst_SOPP__S_SLEEP - - // --- description from .arch file --- - // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks. - // The exact amount of delay is approximate. Compare with S_NOP. 
- void - Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16; - gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP); - // sleep duration is specified in multiples of 64 cycles - gpuDynInst->wavefront()->setSleepTime(64 * simm16); - } // execute - // --- Inst_SOPP__S_SETPRIO class methods --- - - Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_setprio") - { - setFlag(ALU); - } // Inst_SOPP__S_SETPRIO - - Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO() - { - } // ~Inst_SOPP__S_SETPRIO - - // --- description from .arch file --- - // User settable wave priority is set to SIMM16[1:0]. 0 = lowest, - // 3 = highest. - // The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0], - // WaveAge[3:0]}. - void - Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU16 simm16 = instData.SIMM16; - ScalarRegU32 userPrio = simm16 & 0x3; - - warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio); - } // execute - // --- Inst_SOPP__S_SENDMSG class methods --- - - Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sendmsg") - { - } // Inst_SOPP__S_SENDMSG - - Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG() - { - } // ~Inst_SOPP__S_SENDMSG - - // --- description from .arch file --- - // Send a message upstream to VGT or the interrupt handler. - // SIMM16[9:0] contains the message type and is documented in the shader - // --- programming guide. 
- void - Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SENDMSGHALT class methods --- - - Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sendmsghalt") - { - } // Inst_SOPP__S_SENDMSGHALT - - Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT() - { - } // ~Inst_SOPP__S_SENDMSGHALT - - // --- description from .arch file --- - // Send a message and then HALT the wavefront; see S_SENDMSG for details. - void - Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_TRAP class methods --- - - Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_trap") - { - } // Inst_SOPP__S_TRAP - - Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP() - { - } // ~Inst_SOPP__S_TRAP - - // --- description from .arch file --- - // TrapID = SIMM16[7:0]; - // Wait for all instructions to complete; - // set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0], - // PC[47:0]}; - // PC = TBA (trap base address); - // PRIV = 1. - // Enter the trap handler. This instruction may be generated internally as - // well in response to a host trap (HT = 1) or an exception. - // TrapID 0 is reserved for hardware use and should not be used in a - // shader-generated trap. - void - Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_ICACHE_INV class methods --- - - Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_icache_inv") - { - } // Inst_SOPP__S_ICACHE_INV - - Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() - { - } // ~Inst_SOPP__S_ICACHE_INV - - // --- description from .arch file --- - // Invalidate entire L1 instruction cache. - // You must have 12 separate S_NOP instructions or a jump/branch - // instruction after this instruction - // to ensure the SQ instruction buffer is purged. 
- void - Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_INCPERFLEVEL class methods --- - - Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_incperflevel") - { - } // Inst_SOPP__S_INCPERFLEVEL - - Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL() - { - } // ~Inst_SOPP__S_INCPERFLEVEL - - // --- description from .arch file --- - // Increment performance counter specified in SIMM16[3:0] by 1. - void - Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_DECPERFLEVEL class methods --- - - Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_decperflevel") - { - } // Inst_SOPP__S_DECPERFLEVEL - - Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL() - { - } // ~Inst_SOPP__S_DECPERFLEVEL - - // --- description from .arch file --- - // Decrement performance counter specified in SIMM16[3:0] by 1. - void - Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_TTRACEDATA class methods --- - - Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_ttracedata") - { - } // Inst_SOPP__S_TTRACEDATA - - Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA() - { - } // ~Inst_SOPP__S_TTRACEDATA - - // --- description from .arch file --- - // Send M0 as user data to the thread trace stream. 
- void - Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods --- - - Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS - - Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS - - // --- description from .arch file --- - // if(conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4) - // + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods --- - - Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbguser") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGUSER - - Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGUSER - - // --- description from .arch file --- - // if(conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods --- - - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER - - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER:: - ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER - - // --- description from .arch file --- - // if(conditional_debug_system || conditional_debug_user) then PC = PC + - // --- signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods --- - - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER - - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: - ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER - - // --- description from .arch file --- - // if(conditional_debug_system && conditional_debug_user) then PC = PC + - // --- signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_ENDPGM_SAVED class methods --- - - Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_endpgm_saved") - { - } // Inst_SOPP__S_ENDPGM_SAVED - - Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED() - { - } // ~Inst_SOPP__S_ENDPGM_SAVED - - // --- description from .arch file --- - // End of program; signal that a wave has been saved by the context-switch - // trap handler and terminate wavefront. - // The hardware implicitly executes S_WAITCNT 0 before executing this - // instruction. - // Use S_ENDPGM in all cases unless you are executing the context-switch - // save handler. - void - Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods --- - - Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_set_gpr_idx_off") - { - } // Inst_SOPP__S_SET_GPR_IDX_OFF - - Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF() - { - } // ~Inst_SOPP__S_SET_GPR_IDX_OFF - - // --- description from .arch file --- - // MODE.gpr_idx_en = 0. 
- // Clear GPR indexing mode. Vector operations after this will not perform - // --- relative GPR addressing regardless of the contents of M0. This - // --- instruction does not modify M0. - void - Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods --- - - Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_set_gpr_idx_mode") - { - } // Inst_SOPP__S_SET_GPR_IDX_MODE - - Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE() - { - } // ~Inst_SOPP__S_SET_GPR_IDX_MODE - - // --- description from .arch file --- - // M0[15:12] = SIMM4. - // Modify the mode used for vector GPR indexing. - // The raw contents of the source field are read and used to set the enable - // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL - // and SIMM4[3] = VDST_REL. - void - Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_LOAD_DWORD class methods --- - - Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORD - - Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD() - { - } // ~Inst_SMEM__S_LOAD_DWORD - - /** - * Read 1 dword from scalar data cache. If the offset is specified as an - * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are - * ignored). If the offset is specified as an immediate 20-bit constant, - * the constant is an unsigned byte offset. 
- */ - void - Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX2 class methods --- - - Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX2 - - Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() - { - } // ~Inst_SMEM__S_LOAD_DWORDX2 - - /** - * Read 2 dwords from scalar data cache. See s_load_dword for details on - * the offset input. 
- */ - void - Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX4 class methods --- - - Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX4 - - Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4() - { - } // ~Inst_SMEM__S_LOAD_DWORDX4 - - // --- description from .arch file --- - // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX8 class methods --- - - Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX8 - - Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8() - { - } // ~Inst_SMEM__S_LOAD_DWORDX8 - - // --- description from .arch file --- - // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<8>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_LOAD_DWORDX16 class methods --- - - Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX16 - - Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16() - { - } // ~Inst_SMEM__S_LOAD_DWORDX16 - - // --- description from .arch file --- - // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<16>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORD class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORD - - Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD - - // --- description from .arch file --- - // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the - // --- offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 1 request, size 32 - ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX2 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2 - - // --- description from .arch file --- - // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - // use U64 because 2 requests, each size 32 - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX4 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4 - - // --- description from .arch file --- - // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 4 requests, each size 32 - ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX8 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8 - - // --- description from .arch file --- - // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<8>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 8 requests, each size 32 - ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX16 class methods --- - - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16 - - // --- description from .arch file --- - // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<16>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 16 requests, each size 32 - ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - // --- Inst_SMEM__S_STORE_DWORD class methods --- - - Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORD - - Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD() - { - } // ~Inst_SMEM__S_STORE_DWORD - - // --- description from .arch file --- - // Write 1 dword to scalar data cache. - // If the offset is specified as an SGPR, the SGPR contains an unsigned - // BYTE offset (the 2 LSBs are ignored). - // If the offset is specified as an immediate 20-bit constant, the - // constant is an unsigned BYTE offset. 
- void - Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(ScalarRegU32)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_STORE_DWORDX2 class methods --- - - Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORDX2 - - Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2() - { - } // ~Inst_SMEM__S_STORE_DWORDX2 - - // --- description from .arch file --- - // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(ScalarRegU64)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_STORE_DWORDX4 class methods --- - - Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORDX4 - - Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4() - { - } // ~Inst_SMEM__S_STORE_DWORDX4 - - // --- description from .arch file --- - // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(gpuDynInst->scalar_data)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_BUFFER_STORE_DWORD class methods --- - - Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORD - - Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORD - - // --- description from .arch file --- - // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the - // --- offset input. 
- void - Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_BUFFER_STORE_DWORDX2 class methods --- - - Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORDX2 - - Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2 - - // --- description from .arch file --- - // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_BUFFER_STORE_DWORDX4 class methods --- - - Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORDX4 - - Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4 - - // --- description from .arch file --- - // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_DCACHE_INV class methods --- - - Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_inv") - { - } // Inst_SMEM__S_DCACHE_INV - - Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV() - { - } // ~Inst_SMEM__S_DCACHE_INV - - // --- description from .arch file --- - // Invalidate the scalar data cache. - void - Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_DCACHE_WB class methods --- - - Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_wb") - { - } // Inst_SMEM__S_DCACHE_WB - - Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB() - { - } // ~Inst_SMEM__S_DCACHE_WB - - // --- description from .arch file --- - // Write back dirty data in the scalar data cache. - void - Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_DCACHE_INV_VOL class methods --- - - Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_inv_vol") - { - } // Inst_SMEM__S_DCACHE_INV_VOL - - Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL() - { - } // ~Inst_SMEM__S_DCACHE_INV_VOL - - // --- description from .arch file --- - // Invalidate the scalar data cache volatile lines. 
- void - Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_DCACHE_WB_VOL class methods --- - - Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_wb_vol") - { - } // Inst_SMEM__S_DCACHE_WB_VOL - - Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL() - { - } // ~Inst_SMEM__S_DCACHE_WB_VOL - - // --- description from .arch file --- - // Write back dirty data in the scalar data cache volatile lines. - void - Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_MEMTIME class methods --- - - Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_memtime") - { - // s_memtime does not issue a memory request - setFlag(ALU); - } // Inst_SMEM__S_MEMTIME - - Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME() - { - } // ~Inst_SMEM__S_MEMTIME - - // --- description from .arch file --- - // Return current 64-bit timestamp. - void - Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst = (ScalarRegU64)gpuDynInst->computeUnit()->curCycle(); - sdst.write(); - } // execute - // --- Inst_SMEM__S_MEMREALTIME class methods --- - - Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_memrealtime") - { - } // Inst_SMEM__S_MEMREALTIME - - Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME() - { - } // ~Inst_SMEM__S_MEMREALTIME - - // --- description from .arch file --- - // Return current 64-bit RTC. 
- void - Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_ATC_PROBE class methods --- - - Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_atc_probe") - { - } // Inst_SMEM__S_ATC_PROBE - - Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE() - { - } // ~Inst_SMEM__S_ATC_PROBE - - // --- description from .arch file --- - // Probe or prefetch an address into the SQC data cache. - void - Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SMEM__S_ATC_PROBE_BUFFER class methods --- - - Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_atc_probe_buffer") - { - } // Inst_SMEM__S_ATC_PROBE_BUFFER - - Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER() - { - } // ~Inst_SMEM__S_ATC_PROBE_BUFFER - - // --- description from .arch file --- - // Probe or prefetch an address into the SQC data cache. - void - Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_CNDMASK_B32 class methods --- - - Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_cndmask_b32") - { - setFlag(ALU); - setFlag(ReadsVCC); - } // Inst_VOP2__V_CNDMASK_B32 - - Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32() - { - } // ~Inst_VOP2__V_CNDMASK_B32 - - // --- description from .arch file --- - // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC - // as a scalar GPR in S2. 
- void - Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ADD_F32 class methods --- - - Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_ADD_F32 - - Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32() - { - } // ~Inst_VOP2__V_ADD_F32 - - // --- description from .arch file --- - // D.f = S0.f + S1.f. - void - Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - VecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isDPPInst()) { - VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src0_dpp.read(); - - DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_dpp[lane] + src1[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_F32 class methods --- - - Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_SUB_F32 - - Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32() - { - } // ~Inst_VOP2__V_SUB_F32 - - // --- description from .arch file --- - // D.f = S0.f - S1.f. - // SQ translates to V_ADD_F32. 
- void - Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_F32 class methods --- - - Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_SUBREV_F32 - - Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32() - { - } // ~Inst_VOP2__V_SUBREV_F32 - - // --- description from .arch file --- - // D.f = S1.f - S0.f. - // SQ translates to V_ADD_F32. - void - Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods --- - - Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MUL_LEGACY_F32 - - Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32() - { - } // ~Inst_VOP2__V_MUL_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
- void - Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_F32 class methods --- - - Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MUL_F32 - - Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32() - { - } // ~Inst_VOP2__V_MUL_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f. - void - Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == 
FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_I32_I24 class methods --- - - Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_i32_i24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_I32_I24 - - Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24() - { - } // ~Inst_VOP2__V_MUL_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0]. - void - Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods --- - - Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_hi_i32_i24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_HI_I32_I24 - - Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24() - { - } // ~Inst_VOP2__V_MUL_HI_I32_I24 - - // --- description from .arch file --- - // D.i = (S0.i[23:0] * S1.i[23:0])>>32. 
- void - Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 tmp_src0 - = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); - VecElemI64 tmp_src1 - = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); - - vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_U32_U24 class methods --- - - Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_u32_u24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_U32_U24 - - Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24() - { - } // ~Inst_VOP2__V_MUL_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0]. - void - Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1, - VecOperandU32& vdst, Wavefront* wf) { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * - bits(src1[lane], 23, 0); - } - } - }; - - vop2Helper(gpuDynInst, opImpl); - } // execute - // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods --- - - Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_hi_u32_u24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_HI_U32_U24 - - Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24() - { - } // ~Inst_VOP2__V_MUL_HI_U32_U24 - - // --- description from .arch file --- - // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
- void - Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_F32 class methods --- - - Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MIN_F32 - - Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() - { - } // ~Inst_VOP2__V_MIN_F32 - - // --- description from .arch file --- - // D.f = (S0.f < S1.f ? S0.f : S1.f). - void - Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_F32 class methods --- - - Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MAX_F32 - - Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() - { - } // ~Inst_VOP2__V_MAX_F32 - - // --- description from .arch file --- - // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
- void - Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_I32 class methods --- - - Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I32 - - Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() - { - } // ~Inst_VOP2__V_MIN_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i). - void - Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_I32 class methods --- - - Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I32 - - Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() - { - } // ~Inst_VOP2__V_MAX_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_U32 class methods --- - - Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U32 - - Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() - { - } // ~Inst_VOP2__V_MIN_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u). - void - Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_U32 class methods --- - - Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U32 - - Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() - { - } // ~Inst_VOP2__V_MAX_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u). 
- void - Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHRREV_B32 class methods --- - - Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B32 - - Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() - { - } // ~Inst_VOP2__V_LSHRREV_B32 - - // --- description from .arch file --- - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ASHRREV_I32 class methods --- - - Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I32 - - Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() - { - } // ~Inst_VOP2__V_ASHRREV_I32 - - // --- description from .arch file --- - // D.i = signext(S1.i) >> S0.i[4:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHLREV_B32 class methods --- - - Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshlrev_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHLREV_B32 - - Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32() - { - } // ~Inst_VOP2__V_LSHLREV_B32 - - // --- description from .arch file --- - // D.u = S1.u << S0.u[4:0]. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and vdst during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. 
SRC0: register " - "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: " - "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: " - "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0); - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_AND_B32 class methods --- - - Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_and_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_AND_B32 - - Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32() - { - } // ~Inst_VOP2__V_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u. - // Input and output modifiers not supported. - void - Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isDPPInst()) { - VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src0_dpp.read(); - - DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_dpp[lane] & src1[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] & src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_OR_B32 class methods --- - - Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_or_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_OR_B32 - - Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32() - { - } // ~Inst_VOP2__V_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u. - // Input and output modifiers not supported. - void - Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. 
SRC0: register v[%d], " - "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] | src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_XOR_B32 class methods --- - - Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_xor_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_XOR_B32 - - Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32() - { - } // ~Inst_VOP2__V_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] ^ src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAC_F32 class methods --- - - Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mac_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAC); - } // Inst_VOP2__V_MAC_F32 - - Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32() - { - } // ~Inst_VOP2__V_MAC_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + D.f. - // SQ translates to V_MAD_F32. - void - Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - VecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - vdst.read(); - - if (isDPPInst()) { - VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src0_dpp.read(); - - DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0_dpp[lane], src1[lane], - vdst[lane]); - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MADMK_F32 class methods --- - - Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madmk_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP2__V_MADMK_F32 - - Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32() - { - } // ~Inst_VOP2__V_MADMK_F32 - - // --- description from .arch file --- - // D.f = S0.f * K + S1.f; K is a 32-bit inline constant. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // --- modifiers. - // SQ translates to V_MAD_F32. 
- void - Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], k, src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MADAK_F32 class methods --- - - Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F32 - - Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() - { - } // ~Inst_VOP2__V_MADAK_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // --- modifiers. - // SQ translates to V_MAD_F32. 
- void - Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], k); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ADD_CO_U32 class methods --- - - Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_ADD_CO_U32 - - Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32() - { - } // ~Inst_VOP2__V_ADD_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED - // --- overflow or carry-out for V_ADDC_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. 
SRC0: register " - "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] + src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); - } - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_CO_U32 class methods --- - - Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUB_CO_U32 - - Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32() - { - } // ~Inst_VOP2__V_SUB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_CO_U32 class methods --- - - Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUBREV_CO_U32 - - Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32() - { - } // ~Inst_VOP2__V_SUBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - vcc.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_ADDC_CO_U32 class methods --- - - Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_addc_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_ADDC_CO_U32 - - Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32() - { - } // ~Inst_VOP2__V_ADDC_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + VCC[threadId]; - // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) - // is an UNSIGNED overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. - void - Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] - + bits(vcc.rawData(), lane); - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] - + (VecElemU64)bits(vcc.rawData(), lane, lane)) - >= 0x100000000 ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBB_CO_U32 class methods --- - - Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subb_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBB_CO_U32 - - Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32() - { - } // ~Inst_VOP2__V_SUBB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // --- overflow. 
- // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // --- source comes from the SGPR-pair at S2.u. - void - Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods --- - - Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subbrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBBREV_CO_U32 - - Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32() - { - } // ~Inst_VOP2__V_SUBBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. - // SQ translates this to V_SUBREV_U32 with reversed operands. 
- void - Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) - > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_ADD_F16 class methods --- - - Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_ADD_F16 - - Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() - { - } // ~Inst_VOP2__V_ADD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_SUB_F16 class methods --- - - Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUB_F16 - - Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() - { - } // ~Inst_VOP2__V_SUB_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 - S1.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. 
- void - Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_SUBREV_F16 class methods --- - - Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUBREV_F16 - - Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() - { - } // ~Inst_VOP2__V_SUBREV_F16 - - // --- description from .arch file --- - // D.f16 = S1.f16 - S0.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. - void - Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MUL_F16 class methods --- - - Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MUL_F16 - - Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() - { - } // ~Inst_VOP2__V_MUL_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MAC_F16 class methods --- - - Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mac_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAC); - } // Inst_VOP2__V_MAC_F16 - - Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() - { - } // ~Inst_VOP2__V_MAC_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + D.f16. - // Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. 
- void - Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MADMK_F16 class methods --- - - Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madmk_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADMK_F16 - - Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() - { - } // ~Inst_VOP2__V_MADMK_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. - void - Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MADAK_F16 class methods --- - - Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F16 - - Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() - { - } // ~Inst_VOP2__V_MADAK_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. - void - Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_ADD_U16 class methods --- - - Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_ADD_U16 - - Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() - { - } // ~Inst_VOP2__V_ADD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 + S1.u16. 
- // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_U16 class methods --- - - Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUB_U16 - - Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() - { - } // ~Inst_VOP2__V_SUB_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 - S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_U16 class methods --- - - Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUBREV_U16 - - Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() - { - } // ~Inst_VOP2__V_SUBREV_U16 - - // --- description from .arch file --- - // D.u16 = S1.u16 - S0.u16. - // Supports saturation (unsigned 16-bit integer domain). - // SQ translates this to V_SUB_U16 with reversed operands. 
- void - Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_LO_U16 class methods --- - - Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_lo_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_LO_U16 - - Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() - { - } // ~Inst_VOP2__V_MUL_LO_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHLREV_B16 class methods --- - - Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshlrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHLREV_B16 - - Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() - { - } // ~Inst_VOP2__V_LSHLREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHRREV_B16 class methods --- - - Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B16 - - Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() - { - } // ~Inst_VOP2__V_LSHRREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ASHRREV_I16 class methods --- - - Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I16 - - Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() - { - } // ~Inst_VOP2__V_ASHRREV_I16 - - // --- description from .arch file --- - // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_F16 class methods --- - - Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MAX_F16 - - Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() - { - } // ~Inst_VOP2__V_MAX_F16 - - // --- description from .arch file --- - // D.f16 = max(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MIN_F16 class methods --- - - Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MIN_F16 - - Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() - { - } // ~Inst_VOP2__V_MIN_F16 - - // --- description from .arch file --- - // D.f16 = min(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MAX_U16 class methods --- - - Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U16 - - Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() - { - } // ~Inst_VOP2__V_MAX_U16 - - // --- description from .arch file --- - // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_I16 class methods --- - - Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I16 - - Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() - { - } // ~Inst_VOP2__V_MAX_I16 - - // --- description from .arch file --- - // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_U16 class methods --- - - Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U16 - - Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() - { - } // ~Inst_VOP2__V_MIN_U16 - - // --- description from .arch file --- - // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_I16 class methods --- - - Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I16 - - Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() - { - } // ~Inst_VOP2__V_MIN_I16 - - // --- description from .arch file --- - // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LDEXP_F16 class methods --- - - Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ldexp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_LDEXP_F16 - - Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() - { - } // ~Inst_VOP2__V_LDEXP_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * (2 ** S1.i16). 
- void - Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_ADD_U32 class methods --- - - Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_ADD_U32 - - Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() - { - } // ~Inst_VOP2__V_ADD_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - void - Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. 
SRC0: register v[%d], " - "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] + src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_U32 class methods --- - - Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_SUB_U32 - - Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() - { - } // ~Inst_VOP2__V_SUB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - void - Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_U32 class methods --- - - 
Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_SUBREV_U32 - - Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32() - { - } // ~Inst_VOP2__V_SUBREV_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - void - Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_FMAC_F32 class methods --- - - Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_fmac_f32") - { - setFlag(ALU); - } // Inst_VOP2__V_FMAC_F32 - - Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32() - { - } // ~Inst_VOP2__V_FMAC_F32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - void - Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - vdst.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_NOP class methods --- - - Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_nop") - { - setFlag(Nop); - setFlag(ALU); - } // Inst_VOP1__V_NOP - - Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP() - { - } // ~Inst_VOP1__V_NOP - - // --- description from .arch file --- - // Do nothing. 
- void - Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_VOP1__V_MOV_B32 class methods --- - - Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_mov_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_MOV_B32 - - Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32() - { - } // ~Inst_VOP1__V_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (isDPPInst()) { - VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src_dpp.read(); - - DPRINTF(VEGA, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - // NOTE: For VOP1, there is no SRC1, so make sure we're not trying - // to negate it or take the absolute value of it - assert(!extData.iFmt_VOP_DPP.SRC1_ABS); - assert(!extData.iFmt_VOP_DPP.SRC1_NEG); - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src_dpp[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_READFIRSTLANE_B32 class methods --- - - 
Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_readfirstlane_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_READFIRSTLANE_B32 - - Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32() - { - } // ~Inst_VOP1__V_READFIRSTLANE_B32 - - // --- description from .arch file --- - // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data - // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec) - // (Lane# = 0 if exec is zero). Ignores exec mask for the access. SQ - // translates to V_READLANE_B32. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarRegI32 src_lane(0); - ScalarRegU64 exec_mask = wf->execMask().to_ullong(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (exec_mask) { - src_lane = findLsbSet(exec_mask); - } - - sdst = src[src_lane]; - - sdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_I32_F64 class methods --- - - Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_I32_F64 - - Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64() - { - } // ~Inst_VOP1__V_CVT_I32_F64 - - // --- description from .arch file --- - // D.i = (int)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F64_I32 class methods --- - - Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_i32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_I32 - - Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32() - { - } // ~Inst_VOP1__V_CVT_F64_I32 - - // --- description from .arch file --- - // D.d = (double)S0.i. - void - Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_I32 class methods --- - - Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_i32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_I32 - - Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32() - { - } // ~Inst_VOP1__V_CVT_F32_I32 - - // --- description from .arch file --- - // D.f = (float)S0.i. 
- void - Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_U32 class methods --- - - Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_u32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_U32 - - Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32() - { - } // ~Inst_VOP1__V_CVT_F32_U32 - - // --- description from .arch file --- - // D.f = (float)S0.u. - void - Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_U32_F32 class methods --- - - Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_U32_F32 - - Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32() - { - } // ~Inst_VOP1__V_CVT_U32_F32 - - // --- description from .arch file --- - // D.u = (unsigned)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_I32_F32 class methods --- - - Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_I32_F32 - - Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32() - { - } // ~Inst_VOP1__V_CVT_I32_F32 - - // --- description from .arch file --- - // D.i = (int)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_MOV_FED_B32 class methods --- - - Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_mov_fed_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_MOV_FED_B32 - - Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32() - { - } // ~Inst_VOP1__V_MOV_FED_B32 - - // --- description from .arch file --- - // D.u = S0.u; - // Introduce EDC double error upon write to dest vgpr without causing an - // --- exception. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F16_F32 class methods --- - - Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F16_F32 - - Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32() - { - } // ~Inst_VOP1__V_CVT_F16_F32 - - // --- description from .arch file --- - // D.f16 = flt32_to_flt16(S0.f). - // Supports input modifiers and creates FP16 denormals when appropriate. 
- void - Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F32_F16 class methods --- - - Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_f16") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_F16 - - Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16() - { - } // ~Inst_VOP1__V_CVT_F32_F16 - - // --- description from .arch file --- - // D.f = flt16_to_flt32(S0.f16). - // FP16 denormal inputs are always accepted. - void - Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods --- - - Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_RPI_I32_F32 - - Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32() - { - } // ~Inst_VOP1__V_CVT_RPI_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f + 0.5). - void - Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_FLR_I32_F32 class methods --- - - Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_FLR_I32_F32 - - Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32() - { - } // ~Inst_VOP1__V_CVT_FLR_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f). 
- void - Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_OFF_F32_I4 class methods --- - - Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_off_f32_i4") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_OFF_F32_I4 - - Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4() - { - } // ~Inst_VOP1__V_CVT_OFF_F32_I4 - - // --- description from .arch file --- - // 4-bit signed int to 32-bit float. Used for interpolation in shader. - void - Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) - { - // Could not parse sq_uc.arch desc field - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F32_F64 class methods --- - - Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F32_F64 - - Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64() - { - } // ~Inst_VOP1__V_CVT_F32_F64 - - // --- description from .arch file --- - // D.f = (float)S0.d. 
- void - Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F64_F32 class methods --- - - Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_f32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_F32 - - Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32() - { - } // ~Inst_VOP1__V_CVT_F64_F32 - - // --- description from .arch file --- - // D.d = (double)S0.f. - void - Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE0 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE0 - - Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE0 - - // --- description from .arch file --- - // D.f = (float)(S0.u[7:0]). 
- void - Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE1 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE1 - - Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE1 - - // --- description from .arch file --- - // D.f = (float)(S0.u[15:8]). - void - Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE2 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE2 - - Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE2 - - // --- description from .arch file --- - // D.f = (float)(S0.u[23:16]). 
- void - Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE3 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE3 - - Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE3 - - // --- description from .arch file --- - // D.f = (float)(S0.u[31:24]). - void - Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_U32_F64 class methods --- - - Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_U32_F64 - - Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64() - { - } // ~Inst_VOP1__V_CVT_U32_F64 - - // --- description from .arch file --- - // D.u = (unsigned)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F64_U32 class methods --- - - Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_u32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_U32 - - Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32() - { - } // ~Inst_VOP1__V_CVT_F64_U32 - - // --- description from .arch file --- - // D.d = (double)S0.u. - void - Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_TRUNC_F64 class methods --- - - Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_TRUNC_F64 - - Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64() - { - } // ~Inst_VOP1__V_TRUNC_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d), return integer part of S0.d. 
- void - Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CEIL_F64 class methods --- - - Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CEIL_F64 - - Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64() - { - } // ~Inst_VOP1__V_CEIL_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. - void - Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RNDNE_F64 class methods --- - - Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RNDNE_F64 - - Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64() - { - } // ~Inst_VOP1__V_RNDNE_F64 - - // --- description from .arch file --- - // D.d = round_nearest_even(S0.d). 
- void - Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FLOOR_F64 class methods --- - - Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FLOOR_F64 - - Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64() - { - } // ~Inst_VOP1__V_FLOOR_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. - void - Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FRACT_F32 class methods --- - - Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FRACT_F32 - - Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32() - { - } // ~Inst_VOP1__V_FRACT_F32 - - // --- description from .arch file --- - // D.f = S0.f - floor(S0.f). 
- void - Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_TRUNC_F32 class methods --- - - Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_TRUNC_F32 - - Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32() - { - } // ~Inst_VOP1__V_TRUNC_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f), return integer part of S0.f. - void - Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst (gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CEIL_F32 class methods --- - - Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CEIL_F32 - - Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32() - { - } // ~Inst_VOP1__V_CEIL_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. 
- void - Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RNDNE_F32 class methods --- - - Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RNDNE_F32 - - Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32() - { - } // ~Inst_VOP1__V_RNDNE_F32 - - // --- description from .arch file --- - // D.f = round_nearest_even(S0.f). - void - Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FLOOR_F32 class methods --- - - Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FLOOR_F32 - - Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32() - { - } // ~Inst_VOP1__V_FLOOR_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. 
- void - Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_EXP_F32 class methods --- - - Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_EXP_F32 - - Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32() - { - } // ~Inst_VOP1__V_EXP_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f). - void - Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_LOG_F32 class methods --- - - Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_LOG_F32 - - Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32() - { - } // ~Inst_VOP1__V_LOG_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm. 
- void - Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RCP_F32 class methods --- - - Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RCP_F32 - - Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32() - { - } // ~Inst_VOP1__V_RCP_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error. - void - Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RCP_IFLAG_F32 class methods --- - - Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_iflag_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RCP_IFLAG_F32 - - Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32() - { - } // ~Inst_VOP1__V_RCP_IFLAG_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise - // --- integer DIV_BY_ZERO exception but cannot raise floating-point - // --- exceptions. 
- void - Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RSQ_F32 class methods --- - - Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RSQ_F32 - - Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32() - { - } // ~Inst_VOP1__V_RSQ_F32 - - // --- description from .arch file --- - // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules. - void - Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RCP_F64 class methods --- - - Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RCP_F64 - - Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64() - { - } // ~Inst_VOP1__V_RCP_F64 - - // --- description from .arch file --- - // D.d = 1.0 / S0.d. 
- void - Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = 1.0 / src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RSQ_F64 class methods --- - - Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RSQ_F64 - - Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64() - { - } // ~Inst_VOP1__V_RSQ_F64 - - // --- description from .arch file --- - // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32. 
- void - Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane]) - && !std::signbit(src[lane])) { - vdst[lane] = 0.0; - } else if (std::signbit(src[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_SQRT_F32 class methods --- - - Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_SQRT_F32 - - Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32() - { - } // ~Inst_VOP1__V_SQRT_F32 - - // --- description from .arch file --- - // D.f = sqrt(S0.f). - void - Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_SQRT_F64 class methods --- - - Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_SQRT_F64 - - Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64() - { - } // ~Inst_VOP1__V_SQRT_F64 - - // --- description from .arch file --- - // D.d = sqrt(S0.d). 
- void - Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_SIN_F32 class methods --- - - Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sin_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_SIN_F32 - - Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32() - { - } // ~Inst_VOP1__V_SIN_F32 - - // --- description from .arch file --- - // D.f = sin(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 0.0. - void - Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (src[lane] < -256.0 || src[lane] > 256.0) { - vdst[lane] = 0.0; - } else { - vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData()); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_COS_F32 class methods --- - - Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cos_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_COS_F32 - - Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32() - { - } // ~Inst_VOP1__V_COS_F32 - - // --- description from .arch file --- - // D.f = cos(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 1.0. 
- void - Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (src[lane] < -256.0 || src[lane] > 256.0) { - vdst[lane] = 0.0; - } else { - vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData()); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_NOT_B32 class methods --- - - Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_not_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_NOT_B32 - - Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32() - { - } // ~Inst_VOP1__V_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u. - // Input and output modifiers not supported. - void - Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ~src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_BFREV_B32 class methods --- - - Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_bfrev_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_BFREV_B32 - - Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32() - { - } // ~Inst_VOP1__V_BFREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31], bitfield reverse. - // Input and output modifiers not supported. 
- void - Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = reverseBits(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FFBH_U32 class methods --- - - Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbh_u32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBH_U32 - - Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32() - { - } // ~Inst_VOP1__V_FFBH_U32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from MSB; - // D.u = 0xffffffff if S0.u == 0. - void - Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOneMsb(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FFBL_B32 class methods --- - - Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbl_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBL_B32 - - Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32() - { - } // ~Inst_VOP1__V_FFBL_B32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from LSB; - // D.u = 0xffffffff if S0.u == 0. 
- void - Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOne(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FFBH_I32 class methods --- - - Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbh_i32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBH_I32 - - Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32() - { - } // ~Inst_VOP1__V_FFBH_I32 - - // --- description from .arch file --- - // D.u = position of first bit different from sign bit in S0.i from MSB; - // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. - void - Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = firstOppositeSignBit(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_EXP_I32_F64 class methods --- - - Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FREXP_EXP_I32_F64 - - Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64() - { - } // ~Inst_VOP1__V_FREXP_EXP_I32_F64 - - // --- description from .arch file --- - // See V_FREXP_EXP_I32_F32. 
- void - Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp = 0; - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_MANT_F64 class methods --- - - Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FREXP_MANT_F64 - - Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64() - { - } // ~Inst_VOP1__V_FREXP_MANT_F64 - - // --- description from .arch file --- - // See V_FREXP_MANT_F32. - void - Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FRACT_F64 class methods --- - - Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FRACT_F64 - - Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64() - { - } // ~Inst_VOP1__V_FRACT_F64 - - // --- description from .arch file --- - // See V_FRACT_F32. 
- void - Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF64 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_EXP_I32_F32 class methods --- - - Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FREXP_EXP_I32_F32 - - Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32() - { - } // ~Inst_VOP1__V_FREXP_EXP_I32_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) then D.i = 0; - // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). - // Returns exponent of single precision float input, such that S0.f = - // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns - // the significand. 
- void - Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_FREXP_MANT_F32 class methods --- - - Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FREXP_MANT_F32 - - Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32() - { - } // ~Inst_VOP1__V_FREXP_MANT_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) then D.f = S0.f; - // else D.f = Mantissa(S0.f). - // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary - // --- significand of single precision float input, such that S0.f = - // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which - // --- returns integer exponent. 
- void - Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CLREXCP class methods --- - - Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_clrexcp") - { - setFlag(ALU); - } // Inst_VOP1__V_CLREXCP - - Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP() - { - } // ~Inst_VOP1__V_CLREXCP - - // --- description from .arch file --- - // Clear wave's exception state in SIMD (SP). - void - Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F16_U16 class methods --- - - Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_u16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_F16_U16 - - Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16() - { - } // ~Inst_VOP1__V_CVT_F16_U16 - - // --- description from .arch file --- - // D.f16 = uint16_to_flt16(S.u16). - // Supports denormals, rounding, exception flags and saturation. - void - Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_F16_I16 class methods --- - - Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_i16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_F16_I16 - - Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16() - { - } // ~Inst_VOP1__V_CVT_F16_I16 - - // --- description from .arch file --- - // D.f16 = int16_to_flt16(S.i16). 
- // Supports denormals, rounding, exception flags and saturation. - void - Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_U16_F16 class methods --- - - Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_U16_F16 - - Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16() - { - } // ~Inst_VOP1__V_CVT_U16_F16 - - // --- description from .arch file --- - // D.u16 = flt16_to_uint16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CVT_I16_F16 class methods --- - - Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_I16_F16 - - Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16() - { - } // ~Inst_VOP1__V_CVT_I16_F16 - - // --- description from .arch file --- - // D.i16 = flt16_to_int16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_RCP_F16 class methods --- - - Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RCP_F16 - - Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16() - { - } // ~Inst_VOP1__V_RCP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecip(S0.f16). 
- void - Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_SQRT_F16 class methods --- - - Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_SQRT_F16 - - Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16() - { - } // ~Inst_VOP1__V_SQRT_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateSqrt(S0.f16). - void - Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_RSQ_F16 class methods --- - - Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RSQ_F16 - - Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16() - { - } // ~Inst_VOP1__V_RSQ_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecipSqrt(S0.f16). - void - Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_LOG_F16 class methods --- - - Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_LOG_F16 - - Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16() - { - } // ~Inst_VOP1__V_LOG_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 0.0f; - // else - // D.f16 = ApproximateLog2(S0.f16). 
- void - Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_EXP_F16 class methods --- - - Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_EXP_F16 - - Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16() - { - } // ~Inst_VOP1__V_EXP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 0.0f) - // D.f16 = 1.0f; - // else - // D.f16 = Approximate2ToX(S0.f16). - void - Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FREXP_MANT_F16 class methods --- - - Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FREXP_MANT_F16 - - Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16() - { - } // ~Inst_VOP1__V_FREXP_MANT_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.f16 = S0.f16; - // else - // D.f16 = mantissa(S0.f16). - // Result range is (-1.0,-0.5][0.5,1.0). - // C math library frexp function. - // Returns binary significand of half precision float input, such that the - // original single float = significand * (2 ** exponent). 
- void - Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FREXP_EXP_I16_F16 class methods --- - - Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FREXP_EXP_I16_F16 - - Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16() - { - } // ~Inst_VOP1__V_FREXP_EXP_I16_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.i16 = 0; - // else - // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). - // C math library frexp function. - // Returns exponent of half precision float input, such that the - // original single float = significand * (2 ** exponent). - void - Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FLOOR_F16 class methods --- - - Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FLOOR_F16 - - Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16() - { - } // ~Inst_VOP1__V_FLOOR_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. - void - Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_CEIL_F16 class methods --- - - Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CEIL_F16 - - Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16() - { - } // ~Inst_VOP1__V_CEIL_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
- void - Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_TRUNC_F16 class methods --- - - Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_TRUNC_F16 - - Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16() - { - } // ~Inst_VOP1__V_TRUNC_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16). - // Round-to-zero semantics. - void - Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_RNDNE_F16 class methods --- - - Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RNDNE_F16 - - Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16() - { - } // ~Inst_VOP1__V_RNDNE_F16 - - // --- description from .arch file --- - // D.f16 = FLOOR(S0.f16 + 0.5f); - // if(floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. - // Round-to-nearest-even semantics. - void - Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_FRACT_F16 class methods --- - - Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FRACT_F16 - - Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16() - { - } // ~Inst_VOP1__V_FRACT_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + -floor(S0.f16). 
- void - Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_SIN_F16 class methods --- - - Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sin_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_SIN_F16 - - Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16() - { - } // ~Inst_VOP1__V_SIN_F16 - - // --- description from .arch file --- - // D.f16 = sin(S0.f16 * 2 * PI). - void - Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_COS_F16 class methods --- - - Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cos_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_COS_F16 - - Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16() - { - } // ~Inst_VOP1__V_COS_F16 - - // --- description from .arch file --- - // D.f16 = cos(S0.f16 * 2 * PI). - void - Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP1__V_EXP_LEGACY_F32 class methods --- - - Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_EXP_LEGACY_F32 - - Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32() - { - } // ~Inst_VOP1__V_EXP_LEGACY_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f) with legacy semantics. 
- void - Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_LOG_LEGACY_F32 class methods --- - - Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_LOG_LEGACY_F32 - - Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32() - { - } // ~Inst_VOP1__V_LOG_LEGACY_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. - void - Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOPC__V_CMP_CLASS_F32 class methods --- - - Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_CLASS_F32 - - Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32() - { - } // ~Inst_VOPC__V_CMP_CLASS_F32 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. 
- // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && 
!std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F32 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F32 - - Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F32 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.f The function reports true if the floating point value is *any* of - // the numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_CLASS_F64 class methods --- - - 
Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_CLASS_F64 - - Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() - { - } // ~Inst_VOPC__V_CMP_CLASS_F64 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.d - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F64 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC 
*iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F64 - - Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F64 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.d The function reports true if the floating point value is *any* of - // the numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if 
(bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_CLASS_F16 class methods --- - - Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_CLASS_F16 - - Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16() - { - } // ~Inst_VOPC__V_CMP_CLASS_F16 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F16 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F16 - - Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F16 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // --- S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_F_F16 class methods --- - - Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_F_F16 - - Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() - { - } // ~Inst_VOPC__V_CMP_F_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F16 class methods --- - - Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LT_F16 - - Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() - { - } // ~Inst_VOPC__V_CMP_LT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F16 class methods --- - - Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_EQ_F16 - - Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() - { - } // ~Inst_VOPC__V_CMP_EQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F16 class methods --- - - Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LE_F16 - - Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() - { - } // ~Inst_VOPC__V_CMP_LE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F16 class methods --- - - Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GT_F16 - - Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() - { - } // ~Inst_VOPC__V_CMP_GT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F16 class methods --- - - Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LG_F16 - - Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() - { - } // ~Inst_VOPC__V_CMP_LG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F16 class methods --- - - Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GE_F16 - - Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() - { - } // ~Inst_VOPC__V_CMP_GE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_O_F16 class methods --- - - Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_O_F16 - - Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() - { - } // ~Inst_VOPC__V_CMP_O_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_U_F16 class methods --- - - Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_U_F16 - - Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() - { - } // ~Inst_VOPC__V_CMP_U_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F16 class methods --- - - Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGE_F16 - - Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() - { - } // ~Inst_VOPC__V_CMP_NGE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F16 class methods --- - - Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLG_F16 - - Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() - { - } // ~Inst_VOPC__V_CMP_NLG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F16 class methods --- - - Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGT_F16 - - Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() - { - } // ~Inst_VOPC__V_CMP_NGT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F16 class methods --- - - Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLE_F16 - - Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() - { - } // ~Inst_VOPC__V_CMP_NLE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F16 class methods --- - - Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NEQ_F16 - - Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() - { - } // ~Inst_VOPC__V_CMP_NEQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F16 class methods --- - - Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLT_F16 - - Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() - { - } // ~Inst_VOPC__V_CMP_NLT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F16 class methods --- - - Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_TRU_F16 - - Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() - { - } // ~Inst_VOPC__V_CMP_TRU_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F16 class methods --- - - Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F16 - - Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() - { - } // ~Inst_VOPC__V_CMPX_F_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F16 class methods --- - - Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F16 - - Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() - { - } // ~Inst_VOPC__V_CMPX_LT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F16 class methods --- - - Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F16 - - Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() - { - } // ~Inst_VOPC__V_CMPX_EQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F16 class methods --- - - Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F16 - - Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() - { - } // ~Inst_VOPC__V_CMPX_LE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F16 class methods --- - - Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F16 - - Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() - { - } // ~Inst_VOPC__V_CMPX_GT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F16 class methods --- - - Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F16 - - Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() - { - } // ~Inst_VOPC__V_CMPX_LG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F16 class methods --- - - Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F16 - - Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() - { - } // ~Inst_VOPC__V_CMPX_GE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F16 class methods --- - - Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F16 - - Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() - { - } // ~Inst_VOPC__V_CMPX_O_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F16 class methods --- - - Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F16 - - Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() - { - } // ~Inst_VOPC__V_CMPX_U_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F16 class methods --- - - Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F16 - - Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16() - { - } // ~Inst_VOPC__V_CMPX_NGE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F16 class methods --- - - Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F16 - - Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16() - { - } // ~Inst_VOPC__V_CMPX_NLG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F16 class methods --- - - Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F16 - - Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16() - { - } // ~Inst_VOPC__V_CMPX_NGT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F16 class methods --- - - Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F16 - - Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16() - { - } // ~Inst_VOPC__V_CMPX_NLE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F16 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F16 - - Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F16 class methods --- - - Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F16 - - Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16() - { - } // ~Inst_VOPC__V_CMPX_NLT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F16 class methods --- - - Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F16 - - Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16() - { - } // ~Inst_VOPC__V_CMPX_TRU_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_F_F32 class methods --- - - Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_F_F32 - - Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32() - { - } // ~Inst_VOPC__V_CMP_F_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F32 class methods --- - - Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LT_F32 - - Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32() - { - } // ~Inst_VOPC__V_CMP_LT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F32 class methods --- - - Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_EQ_F32 - - Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32() - { - } // ~Inst_VOPC__V_CMP_EQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F32 class methods --- - - Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LE_F32 - - Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32() - { - } // ~Inst_VOPC__V_CMP_LE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F32 class methods --- - - Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GT_F32 - - Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32() - { - } // ~Inst_VOPC__V_CMP_GT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F32 class methods --- - - Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LG_F32 - - Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32() - { - } // ~Inst_VOPC__V_CMP_LG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F32 class methods --- - - Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GE_F32 - - Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32() - { - } // ~Inst_VOPC__V_CMP_GE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_O_F32 class methods --- - - Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_O_F32 - - Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32() - { - } // ~Inst_VOPC__V_CMP_O_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_U_F32 class methods --- - - Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_U_F32 - - Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32() - { - } // ~Inst_VOPC__V_CMP_U_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F32 class methods --- - - Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGE_F32 - - Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32() - { - } // ~Inst_VOPC__V_CMP_NGE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F32 class methods --- - - Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLG_F32 - - Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32() - { - } // ~Inst_VOPC__V_CMP_NLG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F32 class methods --- - - Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGT_F32 - - Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32() - { - } // ~Inst_VOPC__V_CMP_NGT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F32 class methods --- - - Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLE_F32 - - Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32() - { - } // ~Inst_VOPC__V_CMP_NLE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F32 class methods --- - - Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NEQ_F32 - - Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32() - { - } // ~Inst_VOPC__V_CMP_NEQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F32 class methods --- - - Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLT_F32 - - Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32() - { - } // ~Inst_VOPC__V_CMP_NLT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F32 class methods --- - - Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_TRU_F32 - - Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32() - { - } // ~Inst_VOPC__V_CMP_TRU_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F32 class methods --- - - Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F32 - - Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32() - { - } // ~Inst_VOPC__V_CMPX_F_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F32 class methods --- - - Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F32 - - Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32() - { - } // ~Inst_VOPC__V_CMPX_LT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F32 class methods --- - - Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F32 - - Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32() - { - } // ~Inst_VOPC__V_CMPX_EQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F32 class methods --- - - Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F32 - - Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32() - { - } // ~Inst_VOPC__V_CMPX_LE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F32 class methods --- - - Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F32 - - Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32() - { - } // ~Inst_VOPC__V_CMPX_GT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F32 class methods --- - - Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F32 - - Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32() - { - } // ~Inst_VOPC__V_CMPX_LG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F32 class methods --- - - Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F32 - - Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32() - { - } // ~Inst_VOPC__V_CMPX_GE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F32 class methods --- - - Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F32 - - Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32() - { - } // ~Inst_VOPC__V_CMPX_O_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F32 class methods --- - - Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F32 - - Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32() - { - } // ~Inst_VOPC__V_CMPX_U_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F32 class methods --- - - Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F32 - - Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32() - { - } // ~Inst_VOPC__V_CMPX_NGE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F32 class methods --- - - Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F32 - - Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32() - { - } // ~Inst_VOPC__V_CMPX_NLG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F32 class methods --- - - Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F32 - - Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32() - { - } // ~Inst_VOPC__V_CMPX_NGT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F32 class methods --- - - Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F32 - - Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32() - { - } // ~Inst_VOPC__V_CMPX_NLE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F32 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F32 - - Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F32 class methods --- - - Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F32 - - Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32() - { - } // ~Inst_VOPC__V_CMPX_NLT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F32 class methods --- - - Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F32 - - Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32() - { - } // ~Inst_VOPC__V_CMPX_TRU_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_F_F64 class methods --- - - Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_F_F64 - - Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64() - { - } // ~Inst_VOPC__V_CMP_F_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F64 class methods --- - - Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LT_F64 - - Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64() - { - } // ~Inst_VOPC__V_CMP_LT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F64 class methods --- - - Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_EQ_F64 - - Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64() - { - } // ~Inst_VOPC__V_CMP_EQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F64 class methods --- - - Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LE_F64 - - Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64() - { - } // ~Inst_VOPC__V_CMP_LE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F64 class methods --- - - Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GT_F64 - - Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64() - { - } // ~Inst_VOPC__V_CMP_GT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F64 class methods --- - - Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LG_F64 - - Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64() - { - } // ~Inst_VOPC__V_CMP_LG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F64 class methods --- - - Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GE_F64 - - Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64() - { - } // ~Inst_VOPC__V_CMP_GE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_O_F64 class methods --- - - Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_O_F64 - - Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64() - { - } // ~Inst_VOPC__V_CMP_O_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_U_F64 class methods --- - - Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_U_F64 - - Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64() - { - } // ~Inst_VOPC__V_CMP_U_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F64 class methods --- - - Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGE_F64 - - Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64() - { - } // ~Inst_VOPC__V_CMP_NGE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F64 class methods --- - - Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLG_F64 - - Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64() - { - } // ~Inst_VOPC__V_CMP_NLG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F64 class methods --- - - Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGT_F64 - - Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64() - { - } // ~Inst_VOPC__V_CMP_NGT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F64 class methods --- - - Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLE_F64 - - Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64() - { - } // ~Inst_VOPC__V_CMP_NLE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F64 class methods --- - - Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NEQ_F64 - - Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64() - { - } // ~Inst_VOPC__V_CMP_NEQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F64 class methods --- - - Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLT_F64 - - Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64() - { - } // ~Inst_VOPC__V_CMP_NLT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F64 class methods --- - - Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_TRU_F64 - - Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64() - { - } // ~Inst_VOPC__V_CMP_TRU_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F64 class methods --- - - Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F64 - - Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64() - { - } // ~Inst_VOPC__V_CMPX_F_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F64 class methods --- - - Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F64 - - Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64() - { - } // ~Inst_VOPC__V_CMPX_LT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F64 class methods --- - - Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F64 - - Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64() - { - } // ~Inst_VOPC__V_CMPX_EQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F64 class methods --- - - Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F64 - - Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64() - { - } // ~Inst_VOPC__V_CMPX_LE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F64 class methods --- - - Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F64 - - Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64() - { - } // ~Inst_VOPC__V_CMPX_GT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F64 class methods --- - - Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F64 - - Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64() - { - } // ~Inst_VOPC__V_CMPX_LG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F64 class methods --- - - Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F64 - - Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64() - { - } // ~Inst_VOPC__V_CMPX_GE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F64 class methods --- - - Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F64 - - Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64() - { - } // ~Inst_VOPC__V_CMPX_O_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F64 class methods --- - - Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F64 - - Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64() - { - } // ~Inst_VOPC__V_CMPX_U_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F64 class methods --- - - Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F64 - - Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64() - { - } // ~Inst_VOPC__V_CMPX_NGE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F64 class methods --- - - Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F64 - - Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64() - { - } // ~Inst_VOPC__V_CMPX_NLG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F64 class methods --- - - Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F64 - - Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64() - { - } // ~Inst_VOPC__V_CMPX_NGT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F64 class methods --- - - Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F64 - - Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64() - { - } // ~Inst_VOPC__V_CMPX_NLE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F64 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F64 - - Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F64 class methods --- - - Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F64 - - Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64() - { - } // ~Inst_VOPC__V_CMPX_NLT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F64 class methods --- - - Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F64 - - Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64() - { - } // ~Inst_VOPC__V_CMPX_TRU_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I16 class methods --- - - Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I16 - - Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16() - { - } // ~Inst_VOPC__V_CMP_F_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I16 class methods --- - - Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I16 - - Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16() - { - } // ~Inst_VOPC__V_CMP_LT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I16 class methods --- - - Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I16 - - Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16() - { - } // ~Inst_VOPC__V_CMP_EQ_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I16 class methods --- - - Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I16 - - Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16() - { - } // ~Inst_VOPC__V_CMP_LE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I16 class methods --- - - Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I16 - - Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16() - { - } // ~Inst_VOPC__V_CMP_GT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I16 class methods --- - - Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I16 - - Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16() - { - } // ~Inst_VOPC__V_CMP_NE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I16 class methods --- - - Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I16 - - Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16() - { - } // ~Inst_VOPC__V_CMP_GE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I16 class methods --- - - Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I16 - - Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16() - { - } // ~Inst_VOPC__V_CMP_T_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U16 class methods --- - - Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U16 - - Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() - { - } // ~Inst_VOPC__V_CMP_F_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U16 class methods --- - - Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U16 - - Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() - { - } // ~Inst_VOPC__V_CMP_LT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U16 class methods --- - - Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U16 - - Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() - { - } // ~Inst_VOPC__V_CMP_EQ_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U16 class methods --- - - Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U16 - - Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() - { - } // ~Inst_VOPC__V_CMP_LE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U16 class methods --- - - Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U16 - - Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() - { - } // ~Inst_VOPC__V_CMP_GT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U16 class methods --- - - Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U16 - - Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() - { - } // ~Inst_VOPC__V_CMP_NE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U16 class methods --- - - Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U16 - - Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() - { - } // ~Inst_VOPC__V_CMP_GE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U16 class methods --- - - Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U16 - - Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() - { - } // ~Inst_VOPC__V_CMP_T_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I16 class methods --- - - Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I16 - - Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() - { - } // ~Inst_VOPC__V_CMPX_F_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I16 class methods --- - - Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I16 - - Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() - { - } // ~Inst_VOPC__V_CMPX_LT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I16 class methods --- - - Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I16 - - Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() - { - } // ~Inst_VOPC__V_CMPX_EQ_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I16 class methods --- - - Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I16 - - Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() - { - } // ~Inst_VOPC__V_CMPX_LE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I16 class methods --- - - Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I16 - - Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() - { - } // ~Inst_VOPC__V_CMPX_GT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I16 class methods --- - - Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I16 - - Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() - { - } // ~Inst_VOPC__V_CMPX_NE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I16 class methods --- - - Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I16 - - Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() - { - } // ~Inst_VOPC__V_CMPX_GE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I16 class methods --- - - Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I16 - - Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() - { - } // ~Inst_VOPC__V_CMPX_T_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U16 class methods --- - - Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U16 - - Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() - { - } // ~Inst_VOPC__V_CMPX_F_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U16 class methods --- - - Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U16 - - Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() - { - } // ~Inst_VOPC__V_CMPX_LT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U16 class methods --- - - Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U16 - - Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() - { - } // ~Inst_VOPC__V_CMPX_EQ_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U16 class methods --- - - Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U16 - - Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() - { - } // ~Inst_VOPC__V_CMPX_LE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U16 class methods --- - - Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U16 - - Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() - { - } // ~Inst_VOPC__V_CMPX_GT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U16 class methods --- - - Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U16 - - Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() - { - } // ~Inst_VOPC__V_CMPX_NE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U16 class methods --- - - Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U16 - - Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() - { - } // ~Inst_VOPC__V_CMPX_GE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U16 class methods --- - - Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U16 - - Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() - { - } // ~Inst_VOPC__V_CMPX_T_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I32 class methods --- - - Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I32 - - Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() - { - } // ~Inst_VOPC__V_CMP_F_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I32 class methods --- - - Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I32 - - Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() - { - } // ~Inst_VOPC__V_CMP_LT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I32 class methods --- - - Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I32 - - Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() - { - } // ~Inst_VOPC__V_CMP_EQ_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I32 class methods --- - - Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I32 - - Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() - { - } // ~Inst_VOPC__V_CMP_LE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I32 class methods --- - - Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I32 - - Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() - { - } // ~Inst_VOPC__V_CMP_GT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I32 class methods --- - - Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I32 - - Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() - { - } // ~Inst_VOPC__V_CMP_NE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I32 class methods --- - - Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I32 - - Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() - { - } // ~Inst_VOPC__V_CMP_GE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I32 class methods --- - - Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I32 - - Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() - { - } // ~Inst_VOPC__V_CMP_T_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U32 class methods --- - - Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U32 - - Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() - { - } // ~Inst_VOPC__V_CMP_F_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U32 class methods --- - - Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U32 - - Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() - { - } // ~Inst_VOPC__V_CMP_LT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U32 class methods --- - - Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U32 - - Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() - { - } // ~Inst_VOPC__V_CMP_EQ_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U32 class methods --- - - Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U32 - - Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() - { - } // ~Inst_VOPC__V_CMP_LE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U32 class methods --- - - Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U32 - - Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() - { - } // ~Inst_VOPC__V_CMP_GT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U32 class methods --- - - Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U32 - - Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() - { - } // ~Inst_VOPC__V_CMP_NE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U32 class methods --- - - Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U32 - - Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() - { - } // ~Inst_VOPC__V_CMP_GE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U32 class methods --- - - Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U32 - - Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() - { - } // ~Inst_VOPC__V_CMP_T_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I32 class methods --- - - Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I32 - - Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() - { - } // ~Inst_VOPC__V_CMPX_F_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I32 class methods --- - - Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I32 - - Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() - { - } // ~Inst_VOPC__V_CMPX_LT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I32 class methods --- - - Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I32 - - Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() - { - } // ~Inst_VOPC__V_CMPX_EQ_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I32 class methods --- - - Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I32 - - Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() - { - } // ~Inst_VOPC__V_CMPX_LE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I32 class methods --- - - Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I32 - - Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() - { - } // ~Inst_VOPC__V_CMPX_GT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I32 class methods --- - - Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I32 - - Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() - { - } // ~Inst_VOPC__V_CMPX_NE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I32 class methods --- - - Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I32 - - Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() - { - } // ~Inst_VOPC__V_CMPX_GE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I32 class methods --- - - Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I32 - - Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() - { - } // ~Inst_VOPC__V_CMPX_T_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U32 class methods --- - - Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U32 - - Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() - { - } // ~Inst_VOPC__V_CMPX_F_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U32 class methods --- - - Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U32 - - Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() - { - } // ~Inst_VOPC__V_CMPX_LT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U32 class methods --- - - Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U32 - - Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() - { - } // ~Inst_VOPC__V_CMPX_EQ_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U32 class methods --- - - Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U32 - - Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() - { - } // ~Inst_VOPC__V_CMPX_LE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U32 class methods --- - - Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U32 - - Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() - { - } // ~Inst_VOPC__V_CMPX_GT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U32 class methods --- - - Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U32 - - Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() - { - } // ~Inst_VOPC__V_CMPX_NE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U32 class methods --- - - Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U32 - - Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() - { - } // ~Inst_VOPC__V_CMPX_GE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U32 class methods --- - - Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U32 - - Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() - { - } // ~Inst_VOPC__V_CMPX_T_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I64 class methods --- - - Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I64 - - Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() - { - } // ~Inst_VOPC__V_CMP_F_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I64 class methods --- - - Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I64 - - Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() - { - } // ~Inst_VOPC__V_CMP_LT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I64 class methods --- - - Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I64 - - Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() - { - } // ~Inst_VOPC__V_CMP_EQ_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I64 class methods --- - - Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I64 - - Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() - { - } // ~Inst_VOPC__V_CMP_LE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I64 class methods --- - - Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I64 - - Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() - { - } // ~Inst_VOPC__V_CMP_GT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I64 class methods --- - - Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I64 - - Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() - { - } // ~Inst_VOPC__V_CMP_NE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I64 class methods --- - - Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I64 - - Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() - { - } // ~Inst_VOPC__V_CMP_GE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I64 class methods --- - - Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I64 - - Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() - { - } // ~Inst_VOPC__V_CMP_T_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U64 class methods --- - - Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U64 - - Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() - { - } // ~Inst_VOPC__V_CMP_F_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U64 class methods --- - - Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U64 - - Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() - { - } // ~Inst_VOPC__V_CMP_LT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U64 class methods --- - - Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U64 - - Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() - { - } // ~Inst_VOPC__V_CMP_EQ_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U64 class methods --- - - Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U64 - - Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() - { - } // ~Inst_VOPC__V_CMP_LE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U64 class methods --- - - Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U64 - - Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() - { - } // ~Inst_VOPC__V_CMP_GT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U64 class methods --- - - Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U64 - - Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() - { - } // ~Inst_VOPC__V_CMP_NE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U64 class methods --- - - Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U64 - - Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() - { - } // ~Inst_VOPC__V_CMP_GE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U64 class methods --- - - Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U64 - - Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() - { - } // ~Inst_VOPC__V_CMP_T_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I64 class methods --- - - Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I64 - - Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() - { - } // ~Inst_VOPC__V_CMPX_F_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I64 class methods --- - - Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I64 - - Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() - { - } // ~Inst_VOPC__V_CMPX_LT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I64 class methods --- - - Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I64 - - Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() - { - } // ~Inst_VOPC__V_CMPX_EQ_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I64 class methods --- - - Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I64 - - Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() - { - } // ~Inst_VOPC__V_CMPX_LE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I64 class methods --- - - Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I64 - - Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() - { - } // ~Inst_VOPC__V_CMPX_GT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I64 class methods --- - - Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I64 - - Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() - { - } // ~Inst_VOPC__V_CMPX_NE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I64 class methods --- - - Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I64 - - Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() - { - } // ~Inst_VOPC__V_CMPX_GE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I64 class methods --- - - Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I64 - - Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() - { - } // ~Inst_VOPC__V_CMPX_T_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U64 class methods --- - - Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U64 - - Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() - { - } // ~Inst_VOPC__V_CMPX_F_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U64 class methods --- - - Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U64 - - Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() - { - } // ~Inst_VOPC__V_CMPX_LT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U64 class methods --- - - Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U64 - - Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() - { - } // ~Inst_VOPC__V_CMPX_EQ_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U64 class methods --- - - Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U64 - - Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() - { - } // ~Inst_VOPC__V_CMPX_LE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U64 class methods --- - - Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U64 - - Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() - { - } // ~Inst_VOPC__V_CMPX_GT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U64 class methods --- - - Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U64 - - Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() - { - } // ~Inst_VOPC__V_CMPX_NE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U64 class methods --- - - Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U64 - - Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() - { - } // ~Inst_VOPC__V_CMPX_GE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U64 class methods --- - - Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U64 - - Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() - { - } // ~Inst_VOPC__V_CMPX_T_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VINTRP__V_INTERP_P1_F32 class methods --- - - Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_p1_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_P1_F32 - - Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32() - { - } // ~Inst_VINTRP__V_INTERP_P1_F32 - - // --- description from .arch file --- - // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; - // if D == S then data corruption will occur. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VINTRP__V_INTERP_P2_F32 class methods --- - - Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_p2_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_P2_F32 - - Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32() - { - } // ~Inst_VINTRP__V_INTERP_P2_F32 - - // --- description from .arch file --- - // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). 
- // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VINTRP__V_INTERP_MOV_F32 class methods --- - - Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_mov_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_MOV_F32 - - Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32() - { - } // ~Inst_VINTRP__V_INTERP_MOV_F32 - - // --- description from .arch file --- - // D.f = {P10,P20,P0}[S.u]; parameter load. - void - Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F32 class methods --- - - Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_CLASS_F32 - - Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32() - { - } // ~Inst_VOP3__V_CMP_CLASS_F32 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. 
- // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F32 class methods --- - - 
Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F32 - - Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F32 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.f - // The function reports true if the floating point value is *any* of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F64 class methods --- - - 
Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_CLASS_F64 - - Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64() - { - } // ~Inst_VOP3__V_CMP_CLASS_F64 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.d - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F64 class methods --- - - Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64( - 
InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F64 - - Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F64 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.d - // The function reports true if the floating point value is *any* of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - 
continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F16 class methods --- - - Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_CLASS_F16 - - Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16() - { - } // ~Inst_VOP3__V_CMP_CLASS_F16 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F16 class methods --- - - Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F16 - - Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F16 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // --- S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_F_F16 class methods --- - - Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_F_F16 - - Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16() - { - } // ~Inst_VOP3__V_CMP_F_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F16 class methods --- - - Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LT_F16 - - Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16() - { - } // ~Inst_VOP3__V_CMP_LT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F16 class methods --- - - Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_EQ_F16 - - Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16() - { - } // ~Inst_VOP3__V_CMP_EQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F16 class methods --- - - Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LE_F16 - - Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16() - { - } // ~Inst_VOP3__V_CMP_LE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F16 class methods --- - - Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_GT_F16 - - Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16() - { - } // ~Inst_VOP3__V_CMP_GT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F16 class methods --- - - Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LG_F16 - - Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16() - { - } // ~Inst_VOP3__V_CMP_LG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F16 class methods --- - - Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_GE_F16 - - Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16() - { - } // ~Inst_VOP3__V_CMP_GE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_O_F16 class methods --- - - Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_O_F16 - - Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16() - { - } // ~Inst_VOP3__V_CMP_O_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_U_F16 class methods --- - - Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_U_F16 - - Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() - { - } // ~Inst_VOP3__V_CMP_U_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F16 class methods --- - - Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGE_F16 - - Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() - { - } // ~Inst_VOP3__V_CMP_NGE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F16 class methods --- - - Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLG_F16 - - Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() - { - } // ~Inst_VOP3__V_CMP_NLG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F16 class methods --- - - Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGT_F16 - - Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() - { - } // ~Inst_VOP3__V_CMP_NGT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F16 class methods --- - - Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLE_F16 - - Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() - { - } // ~Inst_VOP3__V_CMP_NLE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F16 class methods --- - - Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NEQ_F16 - - Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() - { - } // ~Inst_VOP3__V_CMP_NEQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F16 class methods --- - - Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLT_F16 - - Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() - { - } // ~Inst_VOP3__V_CMP_NLT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F16 class methods --- - - Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_TRU_F16 - - Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() - { - } // ~Inst_VOP3__V_CMP_TRU_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F16 class methods --- - - Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F16 - - Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() - { - } // ~Inst_VOP3__V_CMPX_F_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F16 class methods --- - - Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F16 - - Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() - { - } // ~Inst_VOP3__V_CMPX_LT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F16 class methods --- - - Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F16 - - Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() - { - } // ~Inst_VOP3__V_CMPX_EQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F16 class methods --- - - Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F16 - - Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() - { - } // ~Inst_VOP3__V_CMPX_LE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F16 class methods --- - - Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F16 - - Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() - { - } // ~Inst_VOP3__V_CMPX_GT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F16 class methods --- - - Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F16 - - Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() - { - } // ~Inst_VOP3__V_CMPX_LG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F16 class methods --- - - Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F16 - - Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() - { - } // ~Inst_VOP3__V_CMPX_GE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F16 class methods --- - - Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F16 - - Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() - { - } // ~Inst_VOP3__V_CMPX_O_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F16 class methods --- - - Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F16 - - Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16() - { - } // ~Inst_VOP3__V_CMPX_U_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F16 class methods --- - - Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F16 - - Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16() - { - } // ~Inst_VOP3__V_CMPX_NGE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F16 class methods --- - - Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F16 - - Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16() - { - } // ~Inst_VOP3__V_CMPX_NLG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F16 class methods --- - - Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F16 - - Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16() - { - } // ~Inst_VOP3__V_CMPX_NGT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F16 class methods --- - - Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F16 - - Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16() - { - } // ~Inst_VOP3__V_CMPX_NLE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F16 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F16 - - Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F16 class methods --- - - Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F16 - - Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16() - { - } // ~Inst_VOP3__V_CMPX_NLT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F16 class methods --- - - Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F16 - - Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() - { - } // ~Inst_VOP3__V_CMPX_TRU_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_F32 class methods --- - - Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_F_F32 - - Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() - { - } // ~Inst_VOP3__V_CMP_F_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F32 class methods --- - - Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LT_F32 - - Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() - { - } // ~Inst_VOP3__V_CMP_LT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F32 class methods --- - - Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_EQ_F32 - - Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() - { - } // ~Inst_VOP3__V_CMP_EQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F32 class methods --- - - Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LE_F32 - - Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() - { - } // ~Inst_VOP3__V_CMP_LE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F32 class methods --- - - Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GT_F32 - - Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() - { - } // ~Inst_VOP3__V_CMP_GT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F32 class methods --- - - Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LG_F32 - - Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() - { - } // ~Inst_VOP3__V_CMP_LG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F32 class methods --- - - Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GE_F32 - - Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() - { - } // ~Inst_VOP3__V_CMP_GE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_O_F32 class methods --- - - Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_O_F32 - - Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() - { - } // ~Inst_VOP3__V_CMP_O_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_U_F32 class methods --- - - Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_U_F32 - - Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() - { - } // ~Inst_VOP3__V_CMP_U_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F32 class methods --- - - Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGE_F32 - - Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() - { - } // ~Inst_VOP3__V_CMP_NGE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F32 class methods --- - - Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLG_F32 - - Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() - { - } // ~Inst_VOP3__V_CMP_NLG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F32 class methods --- - - Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGT_F32 - - Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() - { - } // ~Inst_VOP3__V_CMP_NGT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F32 class methods --- - - Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLE_F32 - - Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() - { - } // ~Inst_VOP3__V_CMP_NLE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F32 class methods --- - - Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NEQ_F32 - - Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() - { - } // ~Inst_VOP3__V_CMP_NEQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F32 class methods --- - - Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLT_F32 - - Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() - { - } // ~Inst_VOP3__V_CMP_NLT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F32 class methods --- - - Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_TRU_F32 - - Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32() - { - } // ~Inst_VOP3__V_CMP_TRU_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F32 class methods --- - - Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F32 - - Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32() - { - } // ~Inst_VOP3__V_CMPX_F_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F32 class methods --- - - Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F32 - - Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() - { - } // ~Inst_VOP3__V_CMPX_LT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F32 class methods --- - - Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F32 - - Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() - { - } // ~Inst_VOP3__V_CMPX_EQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F32 class methods --- - - Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F32 - - Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() - { - } // ~Inst_VOP3__V_CMPX_LE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F32 class methods --- - - Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F32 - - Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() - { - } // ~Inst_VOP3__V_CMPX_GT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F32 class methods --- - - Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F32 - - Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() - { - } // ~Inst_VOP3__V_CMPX_LG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F32 class methods --- - - Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F32 - - Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() - { - } // ~Inst_VOP3__V_CMPX_GE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F32 class methods --- - - Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F32 - - Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() - { - } // ~Inst_VOP3__V_CMPX_O_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F32 class methods --- - - Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F32 - - Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() - { - } // ~Inst_VOP3__V_CMPX_U_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F32 class methods --- - - Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F32 - - Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() - { - } // ~Inst_VOP3__V_CMPX_NGE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F32 class methods --- - - Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F32 - - Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() - { - } // ~Inst_VOP3__V_CMPX_NLG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F32 class methods --- - - Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F32 - - Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() - { - } // ~Inst_VOP3__V_CMPX_NGT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F32 class methods --- - - Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F32 - - Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() - { - } // ~Inst_VOP3__V_CMPX_NLE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F32 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F32 - - Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F32 class methods --- - - Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F32 - - Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() - { - } // ~Inst_VOP3__V_CMPX_NLT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F32 class methods --- - - Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F32 - - Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() - { - } // ~Inst_VOP3__V_CMPX_TRU_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_F64 class methods --- - - Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_F_F64 - - Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() - { - } // ~Inst_VOP3__V_CMP_F_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F64 class methods --- - - Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LT_F64 - - Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() - { - } // ~Inst_VOP3__V_CMP_LT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F64 class methods --- - - Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_EQ_F64 - - Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() - { - } // ~Inst_VOP3__V_CMP_EQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F64 class methods --- - - Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LE_F64 - - Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() - { - } // ~Inst_VOP3__V_CMP_LE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F64 class methods --- - - Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GT_F64 - - Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() - { - } // ~Inst_VOP3__V_CMP_GT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F64 class methods --- - - Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LG_F64 - - Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() - { - } // ~Inst_VOP3__V_CMP_LG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F64 class methods --- - - Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GE_F64 - - Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() - { - } // ~Inst_VOP3__V_CMP_GE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_O_F64 class methods --- - - Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_O_F64 - - Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() - { - } // ~Inst_VOP3__V_CMP_O_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_U_F64 class methods --- - - Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_U_F64 - - Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() - { - } // ~Inst_VOP3__V_CMP_U_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F64 class methods --- - - Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGE_F64 - - Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() - { - } // ~Inst_VOP3__V_CMP_NGE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F64 class methods --- - - Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLG_F64 - - Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() - { - } // ~Inst_VOP3__V_CMP_NLG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F64 class methods --- - - Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGT_F64 - - Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() - { - } // ~Inst_VOP3__V_CMP_NGT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F64 class methods --- - - Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLE_F64 - - Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() - { - } // ~Inst_VOP3__V_CMP_NLE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F64 class methods --- - - Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NEQ_F64 - - Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() - { - } // ~Inst_VOP3__V_CMP_NEQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F64 class methods --- - - Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLT_F64 - - Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() - { - } // ~Inst_VOP3__V_CMP_NLT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F64 class methods --- - - Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_TRU_F64 - - Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() - { - } // ~Inst_VOP3__V_CMP_TRU_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F64 class methods --- - - Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F64 - - Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() - { - } // ~Inst_VOP3__V_CMPX_F_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F64 class methods --- - - Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F64 - - Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() - { - } // ~Inst_VOP3__V_CMPX_LT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F64 class methods --- - - Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F64 - - Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() - { - } // ~Inst_VOP3__V_CMPX_EQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F64 class methods --- - - Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F64 - - Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() - { - } // ~Inst_VOP3__V_CMPX_LE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F64 class methods --- - - Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F64 - - Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() - { - } // ~Inst_VOP3__V_CMPX_GT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F64 class methods --- - - Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F64 - - Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() - { - } // ~Inst_VOP3__V_CMPX_LG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F64 class methods --- - - Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F64 - - Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() - { - } // ~Inst_VOP3__V_CMPX_GE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F64 class methods --- - - Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F64 - - Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() - { - } // ~Inst_VOP3__V_CMPX_O_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F64 class methods --- - - Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F64 - - Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() - { - } // ~Inst_VOP3__V_CMPX_U_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F64 class methods --- - - Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F64 - - Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() - { - } // ~Inst_VOP3__V_CMPX_NGE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F64 class methods --- - - Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F64 - - Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() - { - } // ~Inst_VOP3__V_CMPX_NLG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F64 class methods --- - - Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F64 - - Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() - { - } // ~Inst_VOP3__V_CMPX_NGT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F64 class methods --- - - Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F64 - - Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() - { - } // ~Inst_VOP3__V_CMPX_NLE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F64 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F64 - - Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F64 class methods --- - - Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F64 - - Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() - { - } // ~Inst_VOP3__V_CMPX_NLT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F64 class methods --- - - Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F64 - - Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() - { - } // ~Inst_VOP3__V_CMPX_TRU_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I16 class methods --- - - Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I16 - - Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() - { - } // ~Inst_VOP3__V_CMP_F_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I16 class methods --- - - Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I16 - - Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() - { - } // ~Inst_VOP3__V_CMP_LT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I16 class methods --- - - Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I16 - - Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() - { - } // ~Inst_VOP3__V_CMP_EQ_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I16 class methods --- - - Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I16 - - Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() - { - } // ~Inst_VOP3__V_CMP_LE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I16 class methods --- - - Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I16 - - Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() - { - } // ~Inst_VOP3__V_CMP_GT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I16 class methods --- - - Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I16 - - Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() - { - } // ~Inst_VOP3__V_CMP_NE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I16 class methods --- - - Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I16 - - Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() - { - } // ~Inst_VOP3__V_CMP_GE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I16 class methods --- - - Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I16 - - Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() - { - } // ~Inst_VOP3__V_CMP_T_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U16 class methods --- - - Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U16 - - Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() - { - } // ~Inst_VOP3__V_CMP_F_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U16 class methods --- - - Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U16 - - Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() - { - } // ~Inst_VOP3__V_CMP_LT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U16 class methods --- - - Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U16 - - Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() - { - } // ~Inst_VOP3__V_CMP_EQ_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U16 class methods --- - - Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U16 - - Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() - { - } // ~Inst_VOP3__V_CMP_LE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U16 class methods --- - - Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U16 - - Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() - { - } // ~Inst_VOP3__V_CMP_GT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U16 class methods --- - - Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U16 - - Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() - { - } // ~Inst_VOP3__V_CMP_NE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U16 class methods --- - - Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U16 - - Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() - { - } // ~Inst_VOP3__V_CMP_GE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U16 class methods --- - - Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U16 - - Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() - { - } // ~Inst_VOP3__V_CMP_T_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I16 class methods --- - - Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I16 - - Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() - { - } // ~Inst_VOP3__V_CMPX_F_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I16 class methods --- - - Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I16 - - Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() - { - } // ~Inst_VOP3__V_CMPX_LT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I16 class methods --- - - Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I16 - - Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() - { - } // ~Inst_VOP3__V_CMPX_EQ_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I16 class methods --- - - Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I16 - - Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() - { - } // ~Inst_VOP3__V_CMPX_LE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I16 class methods --- - - Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I16 - - Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() - { - } // ~Inst_VOP3__V_CMPX_GT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I16 class methods --- - - Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I16 - - Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16() - { - } // ~Inst_VOP3__V_CMPX_NE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I16 class methods --- - - Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I16 - - Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16() - { - } // ~Inst_VOP3__V_CMPX_GE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I16 class methods --- - - Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I16 - - Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16() - { - } // ~Inst_VOP3__V_CMPX_T_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U16 class methods --- - - Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U16 - - Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16() - { - } // ~Inst_VOP3__V_CMPX_F_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U16 class methods --- - - Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U16 - - Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16() - { - } // ~Inst_VOP3__V_CMPX_LT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U16 class methods --- - - Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U16 - - Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16() - { - } // ~Inst_VOP3__V_CMPX_EQ_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U16 class methods --- - - Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U16 - - Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16() - { - } // ~Inst_VOP3__V_CMPX_LE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U16 class methods --- - - Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U16 - - Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16() - { - } // ~Inst_VOP3__V_CMPX_GT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U16 class methods --- - - Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U16 - - Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16() - { - } // ~Inst_VOP3__V_CMPX_NE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U16 class methods --- - - Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U16 - - Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16() - { - } // ~Inst_VOP3__V_CMPX_GE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U16 class methods --- - - Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U16 - - Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16() - { - } // ~Inst_VOP3__V_CMPX_T_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I32 class methods --- - - Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I32 - - Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32() - { - } // ~Inst_VOP3__V_CMP_F_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I32 class methods --- - - Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I32 - - Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32() - { - } // ~Inst_VOP3__V_CMP_LT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I32 class methods --- - - Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I32 - - Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32() - { - } // ~Inst_VOP3__V_CMP_EQ_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I32 class methods --- - - Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I32 - - Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32() - { - } // ~Inst_VOP3__V_CMP_LE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I32 class methods --- - - Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I32 - - Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32() - { - } // ~Inst_VOP3__V_CMP_GT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I32 class methods --- - - Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I32 - - Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32() - { - } // ~Inst_VOP3__V_CMP_NE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I32 class methods --- - - Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I32 - - Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32() - { - } // ~Inst_VOP3__V_CMP_GE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I32 class methods --- - - Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I32 - - Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32() - { - } // ~Inst_VOP3__V_CMP_T_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U32 class methods --- - - Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U32 - - Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32() - { - } // ~Inst_VOP3__V_CMP_F_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U32 class methods --- - - Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U32 - - Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32() - { - } // ~Inst_VOP3__V_CMP_LT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U32 class methods --- - - Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U32 - - Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32() - { - } // ~Inst_VOP3__V_CMP_EQ_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U32 class methods --- - - Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U32 - - Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32() - { - } // ~Inst_VOP3__V_CMP_LE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U32 class methods --- - - Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U32 - - Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32() - { - } // ~Inst_VOP3__V_CMP_GT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U32 class methods --- - - Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U32 - - Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32() - { - } // ~Inst_VOP3__V_CMP_NE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U32 class methods --- - - Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U32 - - Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32() - { - } // ~Inst_VOP3__V_CMP_GE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U32 class methods --- - - Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U32 - - Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32() - { - } // ~Inst_VOP3__V_CMP_T_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I32 class methods --- - - Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I32 - - Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32() - { - } // ~Inst_VOP3__V_CMPX_F_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I32 class methods --- - - Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I32 - - Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32() - { - } // ~Inst_VOP3__V_CMPX_LT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I32 class methods --- - - Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I32 - - Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32() - { - } // ~Inst_VOP3__V_CMPX_EQ_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I32 class methods --- - - Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I32 - - Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32() - { - } // ~Inst_VOP3__V_CMPX_LE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I32 class methods --- - - Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I32 - - Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32() - { - } // ~Inst_VOP3__V_CMPX_GT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I32 class methods --- - - Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I32 - - Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32() - { - } // ~Inst_VOP3__V_CMPX_NE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I32 class methods --- - - Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I32 - - Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32() - { - } // ~Inst_VOP3__V_CMPX_GE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I32 class methods --- - - Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I32 - - Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32() - { - } // ~Inst_VOP3__V_CMPX_T_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U32 class methods --- - - Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U32 - - Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() - { - } // ~Inst_VOP3__V_CMPX_F_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U32 class methods --- - - Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U32 - - Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() - { - } // ~Inst_VOP3__V_CMPX_LT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U32 class methods --- - - Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U32 - - Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() - { - } // ~Inst_VOP3__V_CMPX_EQ_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U32 class methods --- - - Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U32 - - Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() - { - } // ~Inst_VOP3__V_CMPX_LE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U32 class methods --- - - Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U32 - - Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() - { - } // ~Inst_VOP3__V_CMPX_GT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U32 class methods --- - - Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U32 - - Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() - { - } // ~Inst_VOP3__V_CMPX_NE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U32 class methods --- - - Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U32 - - Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() - { - } // ~Inst_VOP3__V_CMPX_GE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U32 class methods --- - - Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U32 - - Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() - { - } // ~Inst_VOP3__V_CMPX_T_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I64 class methods --- - - Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I64 - - Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() - { - } // ~Inst_VOP3__V_CMP_F_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I64 class methods --- - - Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I64 - - Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() - { - } // ~Inst_VOP3__V_CMP_LT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I64 class methods --- - - Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I64 - - Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() - { - } // ~Inst_VOP3__V_CMP_EQ_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I64 class methods --- - - Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I64 - - Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() - { - } // ~Inst_VOP3__V_CMP_LE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I64 class methods --- - - Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I64 - - Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() - { - } // ~Inst_VOP3__V_CMP_GT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I64 class methods --- - - Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I64 - - Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() - { - } // ~Inst_VOP3__V_CMP_NE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I64 class methods --- - - Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I64 - - Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() - { - } // ~Inst_VOP3__V_CMP_GE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I64 class methods --- - - Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I64 - - Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() - { - } // ~Inst_VOP3__V_CMP_T_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U64 class methods --- - - Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U64 - - Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() - { - } // ~Inst_VOP3__V_CMP_F_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U64 class methods --- - - Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U64 - - Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() - { - } // ~Inst_VOP3__V_CMP_LT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U64 class methods --- - - Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U64 - - Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() - { - } // ~Inst_VOP3__V_CMP_EQ_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U64 class methods --- - - Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U64 - - Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() - { - } // ~Inst_VOP3__V_CMP_LE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U64 class methods --- - - Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U64 - - Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() - { - } // ~Inst_VOP3__V_CMP_GT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U64 class methods --- - - Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U64 - - Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() - { - } // ~Inst_VOP3__V_CMP_NE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U64 class methods --- - - Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U64 - - Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() - { - } // ~Inst_VOP3__V_CMP_GE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U64 class methods --- - - Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U64 - - Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() - { - } // ~Inst_VOP3__V_CMP_T_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I64 class methods --- - - Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I64 - - Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() - { - } // ~Inst_VOP3__V_CMPX_F_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I64 class methods --- - - Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I64 - - Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() - { - } // ~Inst_VOP3__V_CMPX_LT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I64 class methods --- - - Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I64 - - Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() - { - } // ~Inst_VOP3__V_CMPX_EQ_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I64 class methods --- - - Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I64 - - Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() - { - } // ~Inst_VOP3__V_CMPX_LE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I64 class methods --- - - Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I64 - - Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() - { - } // ~Inst_VOP3__V_CMPX_GT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I64 class methods --- - - Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I64 - - Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() - { - } // ~Inst_VOP3__V_CMPX_NE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I64 class methods --- - - Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I64 - - Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() - { - } // ~Inst_VOP3__V_CMPX_GE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I64 class methods --- - - Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I64 - - Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() - { - } // ~Inst_VOP3__V_CMPX_T_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U64 class methods --- - - Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U64 - - Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() - { - } // ~Inst_VOP3__V_CMPX_F_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U64 class methods --- - - Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U64 - - Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() - { - } // ~Inst_VOP3__V_CMPX_LT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U64 class methods --- - - Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U64 - - Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() - { - } // ~Inst_VOP3__V_CMPX_EQ_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U64 class methods --- - - Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U64 - - Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() - { - } // ~Inst_VOP3__V_CMPX_LE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U64 class methods --- - - Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U64 - - Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() - { - } // ~Inst_VOP3__V_CMPX_GT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U64 class methods --- - - Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U64 - - Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() - { - } // ~Inst_VOP3__V_CMPX_NE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U64 class methods --- - - Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U64 - - Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() - { - } // ~Inst_VOP3__V_CMPX_GE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U64 class methods --- - - Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U64 - - Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() - { - } // ~Inst_VOP3__V_CMPX_T_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CNDMASK_B32 class methods --- - - Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cndmask_b32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - } // Inst_VOP3__V_CNDMASK_B32 - - Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() - { - } // ~Inst_VOP3__V_CNDMASK_B32 - - // --- description from .arch file --- - // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC - // as a scalar GPR in S2. - void - Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(vcc.rawData(), lane) - ? 
src1[lane] : src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_F32 class methods --- - - Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_ADD_F32 - - Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() - { - } // ~Inst_VOP3__V_ADD_F32 - - // --- description from .arch file --- - // D.f = S0.f + S1.f. - void - Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_F32 class methods --- - - Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUB_F32 - - Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() - { - } // ~Inst_VOP3__V_SUB_F32 - - // --- description from .arch file --- - // D.f = S0.f - S1.f. - // SQ translates to V_ADD_F32. 
- void - Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_F32 class methods --- - - Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUBREV_F32 - - Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() - { - } // ~Inst_VOP3__V_SUBREV_F32 - - // --- description from .arch file --- - // D.f = S1.f - S0.f. - // SQ translates to V_ADD_F32. 
- void - Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods --- - - Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MUL_LEGACY_F32 - - Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32() - { - } // ~Inst_VOP3__V_MUL_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
- void - Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - 
} - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_F32 class methods --- - - Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MUL_F32 - - Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32() - { - } // ~Inst_VOP3__V_MUL_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f. - void - Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - 
std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_I32_I24 class methods --- - - Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_I32_I24 - - Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24() - { - } // ~Inst_VOP3__V_MUL_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0]. - void - Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods --- - - Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_I32_I24 - - 
Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() - { - } // ~Inst_VOP3__V_MUL_HI_I32_I24 - - // --- description from .arch file --- - // D.i = (S0.i[23:0] * S1.i[23:0])>>32. - void - Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 tmp_src0 - = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); - VecElemI64 tmp_src1 - = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); - - vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_U32_U24 class methods --- - - Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_U32_U24 - - Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() - { - } // ~Inst_VOP3__V_MUL_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0]. 
- void - Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods --- - - Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_U32_U24 - - Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() - { - } // ~Inst_VOP3__V_MUL_HI_U32_U24 - - // --- description from .arch file --- - // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
- void - Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_F32 class methods --- - - Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MIN_F32 - - Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() - { - } // ~Inst_VOP3__V_MIN_F32 - - // --- description from .arch file --- - // D.f = (S0.f < S1.f ? S0.f : S1.f). 
- void - Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_F32 class methods --- - - Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MAX_F32 - - Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() - { - } // ~Inst_VOP3__V_MAX_F32 - - // --- description from .arch file --- - // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
- void - Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_I32 class methods --- - - Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_I32 - - Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() - { - } // ~Inst_VOP3__V_MIN_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i). 
- void - Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_I32 class methods --- - - Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_I32 - - Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() - { - } // ~Inst_VOP3__V_MAX_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_U32 class methods --- - - Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_U32 - - Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() - { - } // ~Inst_VOP3__V_MIN_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u). 
- void - Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_U32 class methods --- - - Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_U32 - - Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() - { - } // ~Inst_VOP3__V_MAX_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u). 
- void - Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHRREV_B32 class methods --- - - Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshrrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B32 - - Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() - { - } // ~Inst_VOP3__V_LSHRREV_B32 - - // --- description from .arch file --- - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I32 class methods --- - - Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I32 - - Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() - { - } // ~Inst_VOP3__V_ASHRREV_I32 - - // --- description from .arch file --- - // D.i = signext(S1.i) >> S0.i[4:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B32 class methods --- - - Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B32 - - Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() - { - } // ~Inst_VOP3__V_LSHLREV_B32 - - // --- description from .arch file --- - // D.u = S1.u << S0.u[4:0]. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_AND_B32 class methods --- - - Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_and_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_AND_B32 - - Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() - { - } // ~Inst_VOP3__V_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] & src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_OR_B32 class methods --- - - Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_OR_B32 - - Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() - { - } // ~Inst_VOP3__V_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_OR3_B32 class methods --- - - Inst_VOP3__V_OR3_B32::Inst_VOP3__V_OR3_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_or3_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_OR3_B32 - - Inst_VOP3__V_OR3_B32::~Inst_VOP3__V_OR3_B32() - { - } // ~Inst_VOP3__V_OR3_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u | S2.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_OR3_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane] | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_XOR_B32 class methods --- - - Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_xor_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_XOR_B32 - - Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() - { - } // ~Inst_VOP3__V_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] ^ src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAC_F32 class methods --- - - Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mac_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAC); - } // Inst_VOP3__V_MAC_F32 - - Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32() - { - } // ~Inst_VOP3__V_MAC_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + D.f. - // SQ translates to V_MAD_F32. 
- void - Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vdst.read(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_CO_U32 class methods --- - - Inst_VOP3__V_ADD_CO_U32::Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_add_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_ADD_CO_U32 - - Inst_VOP3__V_ADD_CO_U32::~Inst_VOP3__V_ADD_CO_U32() - { - } // ~Inst_VOP3__V_ADD_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED - // --- overflow or carry-out for V_ADDC_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP3__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_SUB_CO_U32 class methods --- - - Inst_VOP3__V_SUB_CO_U32::Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_sub_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_SUB_CO_U32 - - Inst_VOP3__V_SUB_CO_U32::~Inst_VOP3__V_SUB_CO_U32() - { - } // ~Inst_VOP3__V_SUB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP3__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_CO_U32 class methods --- - - Inst_VOP3__V_SUBREV_CO_U32::Inst_VOP3__V_SUBREV_CO_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_SUBREV_CO_U32 - - Inst_VOP3__V_SUBREV_CO_U32::~Inst_VOP3__V_SUBREV_CO_U32() - { - } // ~Inst_VOP3__V_SUBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - // SQ translates this to V_SUB_U32 with reversed operands. 
- void - Inst_VOP3__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_ADDC_CO_U32 class methods --- - - Inst_VOP3__V_ADDC_CO_U32::Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_addc_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_ADDC_CO_U32 - - Inst_VOP3__V_ADDC_CO_U32::~Inst_VOP3__V_ADDC_CO_U32() - { - } // ~Inst_VOP3__V_ADDC_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + VCC[threadId]; - // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) - // is an UNSIGNED overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. 
- void - Inst_VOP3__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] - + bits(vcc.rawData(), lane); - sdst.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] - + (VecElemU64)bits(vcc.rawData(), lane)) - >= 0x100000000 ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_SUBB_CO_U32 class methods --- - - Inst_VOP3__V_SUBB_CO_U32::Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subb_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_SUBB_CO_U32 - - Inst_VOP3__V_SUBB_CO_U32::~Inst_VOP3__V_SUBB_CO_U32() - { - } // ~Inst_VOP3__V_SUBB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // --- overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // --- source comes from the SGPR-pair at S2.u. 
- void - Inst_VOP3__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane] - - bits(vcc.rawData(), lane); - sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods --- - - Inst_VOP3__V_SUBBREV_CO_U32::Inst_VOP3__V_SUBBREV_CO_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subbrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_SUBBREV_CO_U32 - - Inst_VOP3__V_SUBBREV_CO_U32::~Inst_VOP3__V_SUBBREV_CO_U32() - { - } // ~Inst_VOP3__V_SUBBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. 
- void - Inst_VOP3__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane] - - bits(vcc.rawData(), lane); - sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_F16 class methods --- - - Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_ADD_F16 - - Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() - { - } // ~Inst_VOP3__V_ADD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SUB_F16 class methods --- - - Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUB_F16 - - Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() - { - } // ~Inst_VOP3__V_SUB_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 - S1.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. 
- void - Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SUBREV_F16 class methods --- - - Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUBREV_F16 - - Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16() - { - } // ~Inst_VOP3__V_SUBREV_F16 - - // --- description from .arch file --- - // D.f16 = S1.f16 - S0.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. - void - Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MUL_F16 class methods --- - - Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_MUL_F16 - - Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16() - { - } // ~Inst_VOP3__V_MUL_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAC_F16 class methods --- - - Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mac_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(MAC); - } // Inst_VOP3__V_MAC_F16 - - Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16() - { - } // ~Inst_VOP3__V_MAC_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + D.f16. - // Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. 
- void - Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_U16 class methods --- - - Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_U16 - - Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16() - { - } // ~Inst_VOP3__V_ADD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 + S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_U16 class methods --- - - Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUB_U16 - - Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16() - { - } // ~Inst_VOP3__V_SUB_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 - S1.u16. - // Supports saturation (unsigned 16-bit integer domain). 
- void - Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_U16 class methods --- - - Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUBREV_U16 - - Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16() - { - } // ~Inst_VOP3__V_SUBREV_U16 - - // --- description from .arch file --- - // D.u16 = S1.u16 - S0.u16. - // Supports saturation (unsigned 16-bit integer domain). - // SQ translates this to V_SUB_U16 with reversed operands. 
- void - Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LO_U16 class methods --- - - Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_lo_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_LO_U16 - - Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16() - { - } // ~Inst_VOP3__V_MUL_LO_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16. - // Supports saturation (unsigned 16-bit integer domain). 
- void - Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B16 class methods --- - - Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B16 - - Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16() - { - } // ~Inst_VOP3__V_LSHLREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHRREV_B16 class methods --- - - Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshrrev_b16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B16 - - Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16() - { - } // ~Inst_VOP3__V_LSHRREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I16 class methods --- - - Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I16 - - Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16() - { - } // ~Inst_VOP3__V_ASHRREV_I16 - - // --- description from .arch file --- - // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_F16 class methods --- - - Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_MAX_F16 - - Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16() - { - } // ~Inst_VOP3__V_MAX_F16 - - // --- description from .arch file --- - // D.f16 = max(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MIN_F16 class methods --- - - Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_MIN_F16 - - Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16() - { - } // ~Inst_VOP3__V_MIN_F16 - - // --- description from .arch file --- - // D.f16 = min(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. 
- void - Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAX_U16 class methods --- - - Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_U16 - - Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16() - { - } // ~Inst_VOP3__V_MAX_U16 - - // --- description from .arch file --- - // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). - void - Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_I16 class methods --- - - Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_I16 - - Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16() - { - } // ~Inst_VOP3__V_MAX_I16 - - // --- description from .arch file --- - // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). 
- void - Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_U16 class methods --- - - Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_U16 - - Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16() - { - } // ~Inst_VOP3__V_MIN_U16 - - // --- description from .arch file --- - // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_I16 class methods --- - - Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_I16 - - Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16() - { - } // ~Inst_VOP3__V_MIN_I16 - - // --- description from .arch file --- - // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). 
- void - Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LDEXP_F16 class methods --- - - Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ldexp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_LDEXP_F16 - - Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16() - { - } // ~Inst_VOP3__V_LDEXP_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * (2 ** S1.i16). - void - Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_U32 class methods --- - - Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_U32 - - Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32() - { - } // ~Inst_VOP3__V_ADD_U32 - - // --- description from .arch file --- - // D.u32 = S0.u32 + S1.u32. 
- void - Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_U32 class methods --- - - Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUB_U32 - - Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32() - { - } // ~Inst_VOP3__V_SUB_U32 - - // --- description from .arch file --- - // D.u32 = S0.u32 - S1.u32. 
- void - Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_U32 class methods --- - - Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUBREV_U32 - - Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32() - { - } // ~Inst_VOP3__V_SUBREV_U32 - - // --- description from .arch file --- - // D.u32 = S1.u32 - S0.u32. 
- void - Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_NOP class methods --- - - Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_nop", false) - { - setFlag(Nop); - setFlag(ALU); - } // Inst_VOP3__V_NOP - - Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP() - { - } // ~Inst_VOP3__V_NOP - - // --- description from .arch file --- - // Do nothing. - void - Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_VOP3__V_MOV_B32 class methods --- - - Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mov_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MOV_B32 - - Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32() - { - } // ~Inst_VOP3__V_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. - // Input and output modifiers not supported; this is an untyped operation. 
- void - Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_I32_F64 class methods --- - - Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_I32_F64 - - Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64() - { - } // ~Inst_VOP3__V_CVT_I32_F64 - - // --- description from .arch file --- - // D.i = (int)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. - void - Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_I32 class methods --- - - Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_I32 - - Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32() - { - } // ~Inst_VOP3__V_CVT_F64_I32 - - // --- 
description from .arch file --- - // D.d = (double)S0.i. - void - Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_I32 class methods --- - - Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_I32 - - Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32() - { - } // ~Inst_VOP3__V_CVT_F32_I32 - - // --- description from .arch file --- - // D.f = (float)S0.i. - void - Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - VecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_U32 class methods --- - - Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_U32 - - Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() - { - } // ~Inst_VOP3__V_CVT_F32_U32 - - // --- description from 
.arch file --- - // D.f = (float)S0.u. - void - Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_U32_F32 class methods --- - - Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_U32_F32 - - Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() - { - } // ~Inst_VOP3__V_CVT_U32_F32 - - // --- description from .arch file --- - // D.u = (unsigned)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_I32_F32 class methods --- - - Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_I32_F32 - - Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32() - { - } // ~Inst_VOP3__V_CVT_I32_F32 - - // --- description from .arch file --- - // D.i = (int)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
- void - Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MOV_FED_B32 class methods --- - - Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mov_fed_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MOV_FED_B32 - - Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32() - { - } // ~Inst_VOP3__V_MOV_FED_B32 - - // --- description from .arch file --- - // D.u = S0.u; - // Introduce EDC double error upon write to dest vgpr without causing an - // --- exception. - // Input and output modifiers not supported; this is an untyped operation. 
- void - Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_F32 class methods --- - - Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F16_F32 - - Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32() - { - } // ~Inst_VOP3__V_CVT_F16_F32 - - // --- description from .arch file --- - // D.f16 = flt32_to_flt16(S0.f). - // Supports input modifiers and creates FP16 denormals when appropriate. - void - Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F32_F16 class methods --- - - Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_F16 - - Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16() - { - } // ~Inst_VOP3__V_CVT_F32_F16 - - // --- description from .arch file --- - // D.f = flt16_to_flt32(S0.f16). - // FP16 denormal inputs are always accepted. - void - Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods --- - - Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_RPI_I32_F32 - - Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32() - { - } // ~Inst_VOP3__V_CVT_RPI_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f + 0.5). 
- void - Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods --- - - Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_FLR_I32_F32 - - Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32() - { - } // ~Inst_VOP3__V_CVT_FLR_I32_F32 - - // --- description from .arch file --- - // D.i = (int)floor(S0.f). - void - Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods --- - - Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_OFF_F32_I4 - - Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4() - { - } // ~Inst_VOP3__V_CVT_OFF_F32_I4 - - // --- description from .arch file --- - // 4-bit signed int to 32-bit float. Used for interpolation in shader. 
- void - Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) - { - // Could not parse sq_uc.arch desc field - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F32_F64 class methods --- - - Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F32_F64 - - Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64() - { - } // ~Inst_VOP3__V_CVT_F32_F64 - - // --- description from .arch file --- - // D.f = (float)S0.d. - void - Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_F32 class methods --- - - Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_F32 - - Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32() - { - } // ~Inst_VOP3__V_CVT_F64_F32 - - // --- description from .arch file --- - // D.d = (double)S0.f. 
- void - Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE0 - - Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE0 - - // --- description from .arch file --- - // D.f = (float)(S0.u[7:0]). 
- void - Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 7, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE1 - - Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE1 - - // --- description from .arch file --- - // D.f = (float)(S0.u[15:8]). - void - Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 15, 8); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE2 - - Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE2 - - // --- description from .arch file --- - // D.f = (float)(S0.u[23:16]). 
- void - Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 23, 16); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods --- - - Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE3 - - Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE3 - - // --- description from .arch file --- - // D.f = (float)(S0.u[31:24]). - void - Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 31, 24); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_U32_F64 class methods --- - - Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_U32_F64 - - Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64() - { - } // ~Inst_VOP3__V_CVT_U32_F64 - - // --- description from .arch file --- - // D.u = (unsigned)S0.d. - // Out-of-range floating point values (including infinity) saturate. 
NaN is - // --- converted to 0. - void - Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_U32 class methods --- - - Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_U32 - - Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32() - { - } // ~Inst_VOP3__V_CVT_F64_U32 - - // --- description from .arch file --- - // D.d = (double)S0.u. 
- void - Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_TRUNC_F64 class methods --- - - Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_TRUNC_F64 - - Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64() - { - } // ~Inst_VOP3__V_TRUNC_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d), return integer part of S0.d. - void - Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CEIL_F64 class methods --- - - Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ceil_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CEIL_F64 - - Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64() - { - } // ~Inst_VOP3__V_CEIL_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. 
- void - Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RNDNE_F64 class methods --- - - Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rndne_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RNDNE_F64 - - Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64() - { - } // ~Inst_VOP3__V_RNDNE_F64 - - // --- description from .arch file --- - // D.d = round_nearest_even(S0.d). - void - Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FLOOR_F64 class methods --- - - Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_floor_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FLOOR_F64 - - Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64() - { - } // ~Inst_VOP3__V_FLOOR_F64 - - // --- description from .arch file --- - // D.d = trunc(S0.d); - // if(S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. 
- void - Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FRACT_F32 class methods --- - - Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FRACT_F32 - - Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() - { - } // ~Inst_VOP3__V_FRACT_F32 - - // --- description from .arch file --- - // D.f = S0.f - floor(S0.f). - void - Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_TRUNC_F32 class methods --- - - Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_TRUNC_F32 - - Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() - { - } // ~Inst_VOP3__V_TRUNC_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f), return integer part of S0.f. 
- void - Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CEIL_F32 class methods --- - - Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ceil_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CEIL_F32 - - Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32() - { - } // ~Inst_VOP3__V_CEIL_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. - void - Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RNDNE_F32 class methods --- - - Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rndne_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RNDNE_F32 - - Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32() - { - } // ~Inst_VOP3__V_RNDNE_F32 - - // --- description from .arch file --- - // D.f = round_nearest_even(S0.f). 
- void - Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FLOOR_F32 class methods --- - - Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_floor_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FLOOR_F32 - - Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32() - { - } // ~Inst_VOP3__V_FLOOR_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. - void - Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_EXP_F32 class methods --- - - Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_EXP_F32 - - Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32() - { - } // ~Inst_VOP3__V_EXP_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f). 
- void - Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LOG_F32 class methods --- - - Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_F32 - - Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32() - { - } // ~Inst_VOP3__V_LOG_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm. - void - Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RCP_F32 class methods --- - - Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RCP_F32 - - Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32() - { - } // ~Inst_VOP3__V_RCP_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. 
Reciprocal with IEEE rules and < 1ulp error. - void - Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods --- - - Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RCP_IFLAG_F32 - - Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32() - { - } // ~Inst_VOP3__V_RCP_IFLAG_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise - // --- integer DIV_BY_ZERO exception but cannot raise floating-point - // --- exceptions. - void - Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RSQ_F32 class methods --- - - Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rsq_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RSQ_F32 - - Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32() - { - } // ~Inst_VOP3__V_RSQ_F32 - - // --- description from .arch file --- - // D.f = 1.0 / sqrt(S0.f). 
Reciprocal square root with IEEE rules. - void - Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RCP_F64 class methods --- - - Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RCP_F64 - - Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64() - { - } // ~Inst_VOP3__V_RCP_F64 - - // --- description from .arch file --- - // D.d = 1.0 / S0.d. - void - Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = 1.0 / src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RSQ_F64 class methods --- - - Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rsq_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RSQ_F64 - - Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64() - { - } // ~Inst_VOP3__V_RSQ_F64 
- - // --- description from .arch file --- - // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32. - void - Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) { - vdst[lane] = 0.0; - } else if (std::signbit(src[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SQRT_F32 class methods --- - - Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sqrt_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SQRT_F32 - - Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32() - { - } // ~Inst_VOP3__V_SQRT_F32 - - // --- description from .arch file --- - // D.f = sqrt(S0.f). 
- void - Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SQRT_F64 class methods --- - - Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sqrt_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_SQRT_F64 - - Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64() - { - } // ~Inst_VOP3__V_SQRT_F64 - - // --- description from .arch file --- - // D.d = sqrt(S0.d). - void - Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SIN_F32 class methods --- - - Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sin_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SIN_F32 - - Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32() - { - } // ~Inst_VOP3__V_SIN_F32 - - // --- description from .arch file --- - // D.f = sin(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 0.0. 
- void - Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sin(src[lane] * 2 * pi.rawData()); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_COS_F32 class methods --- - - Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cos_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_COS_F32 - - Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32() - { - } // ~Inst_VOP3__V_COS_F32 - - // --- description from .arch file --- - // D.f = cos(S0.f * 2 * PI). - // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in - // float 1.0. - void - Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::cos(src[lane] * 2 * pi.rawData()); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_NOT_B32 class methods --- - - Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_not_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_NOT_B32 - - Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32() - { - } // ~Inst_VOP3__V_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u. 
- // Input and output modifiers not supported. - void - Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ~src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BFREV_B32 class methods --- - - Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFREV_B32 - - Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32() - { - } // ~Inst_VOP3__V_BFREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31], bitfield reverse. - // Input and output modifiers not supported. - void - Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = reverseBits(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FFBH_U32 class methods --- - - Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ffbh_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBH_U32 - - Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32() - { - } // ~Inst_VOP3__V_FFBH_U32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from MSB; - // D.u = 0xffffffff if S0.u == 0. 
- void - Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOneMsb(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FFBL_B32 class methods --- - - Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ffbl_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBL_B32 - - Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32() - { - } // ~Inst_VOP3__V_FFBL_B32 - - // --- description from .arch file --- - // D.u = position of first 1 in S0.u from LSB; - // D.u = 0xffffffff if S0.u == 0. - void - Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOne(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FFBH_I32 class methods --- - - Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ffbh_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBH_I32 - - Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32() - { - } // ~Inst_VOP3__V_FFBH_I32 - - // --- description from .arch file --- - // D.u = position of first bit different from sign bit in S0.i from MSB; - // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. 
- void - Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = firstOppositeSignBit(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods --- - - Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FREXP_EXP_I32_F64 - - Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64() - { - } // ~Inst_VOP3__V_FREXP_EXP_I32_F64 - - // --- description from .arch file --- - // See V_FREXP_EXP_I32_F32. - void - Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_MANT_F64 class methods --- - - Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FREXP_MANT_F64 - - Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64() - { - } // ~Inst_VOP3__V_FREXP_MANT_F64 - - // --- 
description from .arch file --- - // See V_FREXP_MANT_F32. - void - Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FRACT_F64 class methods --- - - Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FRACT_F64 - - Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() - { - } // ~Inst_VOP3__V_FRACT_F64 - - // --- description from .arch file --- - // See V_FRACT_F32. - void - Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods --- - - Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FREXP_EXP_I32_F32 - - Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() - { - } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) 
then D.i = 0; - // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). - // Returns exponent of single precision float input, such that S0.f = - // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns - // the significand. - void - Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane])|| std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_MANT_F32 class methods --- - - Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FREXP_MANT_F32 - - Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32() - { - } // ~Inst_VOP3__V_FREXP_MANT_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) then D.f = S0.f; - // else D.f = Mantissa(S0.f). - // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary - // --- significand of single precision float input, such that S0.f = - // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which - // --- returns integer exponent. 
- void - Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CLREXCP class methods --- - - Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_clrexcp", false) - { - } // Inst_VOP3__V_CLREXCP - - Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP() - { - } // ~Inst_VOP3__V_CLREXCP - - // --- description from .arch file --- - // Clear wave's exception state in SIMD (SP). - void - Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_U16 class methods --- - - Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_F16_U16 - - Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16() - { - } // ~Inst_VOP3__V_CVT_F16_U16 - - // --- description from .arch file --- - // D.f16 = uint16_to_flt16(S.u16). - // Supports denormals, rounding, exception flags and saturation. 
- void - Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_I16 class methods --- - - Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_F16_I16 - - Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16() - { - } // ~Inst_VOP3__V_CVT_F16_I16 - - // --- description from .arch file --- - // D.f16 = int16_to_flt16(S.i16). - // Supports denormals, rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_U16_F16 class methods --- - - Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_U16_F16 - - Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16() - { - } // ~Inst_VOP3__V_CVT_U16_F16 - - // --- description from .arch file --- - // D.u16 = flt16_to_uint16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_I16_F16 class methods --- - - Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_I16_F16 - - Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16() - { - } // ~Inst_VOP3__V_CVT_I16_F16 - - // --- description from .arch file --- - // D.i16 = flt16_to_int16(S.f16). - // Supports rounding, exception flags and saturation. 
- void - Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RCP_F16 class methods --- - - Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RCP_F16 - - Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16() - { - } // ~Inst_VOP3__V_RCP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecip(S0.f16). - void - Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SQRT_F16 class methods --- - - Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sqrt_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SQRT_F16 - - Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16() - { - } // ~Inst_VOP3__V_SQRT_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateSqrt(S0.f16). - void - Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RSQ_F16 class methods --- - - Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rsq_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RSQ_F16 - - Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16() - { - } // ~Inst_VOP3__V_RSQ_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecipSqrt(S0.f16). 
- void - Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_LOG_F16 class methods --- - - Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_LOG_F16 - - Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16() - { - } // ~Inst_VOP3__V_LOG_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 0.0f; - // else - // D.f16 = ApproximateLog2(S0.f16). - void - Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_EXP_F16 class methods --- - - Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_EXP_F16 - - Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16() - { - } // ~Inst_VOP3__V_EXP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 0.0f) - // D.f16 = 1.0f; - // else - // D.f16 = Approximate2ToX(S0.f16). - void - Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FREXP_MANT_F16 class methods --- - - Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FREXP_MANT_F16 - - Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16() - { - } // ~Inst_VOP3__V_FREXP_MANT_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.f16 = S0.f16; - // else - // D.f16 = mantissa(S0.f16). - // Result range is (-1.0,-0.5][0.5,1.0). - // C math library frexp function. - // Returns binary significand of half precision float input, such that the - // original single float = significand * (2 ** exponent). 
- void - Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods --- - - Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FREXP_EXP_I16_F16 - - Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16() - { - } // ~Inst_VOP3__V_FREXP_EXP_I16_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.i16 = 0; - // else - // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). - // C math library frexp function. - // Returns exponent of half precision float input, such that the - // original single float = significand * (2 ** exponent). - void - Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FLOOR_F16 class methods --- - - Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_floor_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FLOOR_F16 - - Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16() - { - } // ~Inst_VOP3__V_FLOOR_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. - void - Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CEIL_F16 class methods --- - - Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ceil_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CEIL_F16 - - Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16() - { - } // ~Inst_VOP3__V_CEIL_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
- void - Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_TRUNC_F16 class methods --- - - Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_TRUNC_F16 - - Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16() - { - } // ~Inst_VOP3__V_TRUNC_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16). - // Round-to-zero semantics. - void - Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RNDNE_F16 class methods --- - - Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rndne_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RNDNE_F16 - - Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16() - { - } // ~Inst_VOP3__V_RNDNE_F16 - - // --- description from .arch file --- - // D.f16 = FLOOR(S0.f16 + 0.5f); - // if(floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. - // Round-to-nearest-even semantics. - void - Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FRACT_F16 class methods --- - - Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FRACT_F16 - - Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16() - { - } // ~Inst_VOP3__V_FRACT_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + -floor(S0.f16). 
- void - Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SIN_F16 class methods --- - - Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sin_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SIN_F16 - - Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16() - { - } // ~Inst_VOP3__V_SIN_F16 - - // --- description from .arch file --- - // D.f16 = sin(S0.f16 * 2 * PI). - void - Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_COS_F16 class methods --- - - Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cos_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_COS_F16 - - Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16() - { - } // ~Inst_VOP3__V_COS_F16 - - // --- description from .arch file --- - // D.f16 = cos(S0.f16 * 2 * PI). - void - Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods --- - - Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_EXP_LEGACY_F32 - - Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32() - { - } // ~Inst_VOP3__V_EXP_LEGACY_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f) with legacy semantics. 
- void - Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods --- - - Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_LEGACY_F32 - - Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32() - { - } // ~Inst_VOP3__V_LOG_LEGACY_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. 
- void - Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods --- - - Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP3__V_MAD_LEGACY_F32 - - Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32() - { - } // ~Inst_VOP3__V_MAD_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0). - void - Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_F32 class methods --- - - Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } 
// Inst_VOP3__V_MAD_F32 - - Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() - { - } // ~Inst_VOP3__V_MAD_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f. - void - Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I32_I24 class methods --- - - Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_i32_i24", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I32_I24 - - Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24() - { - } // ~Inst_VOP3__V_MAD_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0] + S2.i. 
- void - Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_U32_U24 class methods --- - - Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_u32_u24", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U32_U24 - - Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24() - { - } // ~Inst_VOP3__V_MAD_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0] + S2.u. 
- void - Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0) - + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CUBEID_F32 class methods --- - - Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubeid_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBEID_F32 - - Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32() - { - } // ~Inst_VOP3__V_CUBEID_F32 - - // --- description from .arch file --- - // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in - // --- (S0.f, S1.f, S2.f). - void - Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBESC_F32 class methods --- - - Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubesc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBESC_F32 - - Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32() - { - } // ~Inst_VOP3__V_CUBESC_F32 - - // --- description from .arch file --- - // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f, - // S2.f). 
- void - Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBETC_F32 class methods --- - - Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubetc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBETC_F32 - - Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32() - { - } // ~Inst_VOP3__V_CUBETC_F32 - - // --- description from .arch file --- - // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f, - // S2.f). - void - Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBEMA_F32 class methods --- - - Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubema_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBEMA_F32 - - Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32() - { - } // ~Inst_VOP3__V_CUBEMA_F32 - - // --- description from .arch file --- - // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f, - // --- S2.f). 
- void - Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_BFE_U32 class methods --- - - Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfe_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFE_U32 - - Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32() - { - } // ~Inst_VOP3__V_BFE_U32 - - // --- description from .arch file --- - // D.u = (S0.u>>S1.u[4:0]) & ((1<wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) - & ((1 << bits(src2[lane], 4, 0)) - 1); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BFE_I32 class methods --- - - Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfe_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFE_I32 - - Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32() - { - } // ~Inst_VOP3__V_BFE_I32 - - // --- description from .arch file --- - // D.i = (S0.i>>S1.u[4:0]) & ((1<wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - 
assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) - & ((1 << bits(src2[lane], 4, 0)) - 1); - - // Above extracted a signed int of size src2 bits which needs - // to be signed-extended. Check if the MSB of our src2-bit - // integer is 1, and sign extend it is. - if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) { - vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BFI_B32 class methods --- - - Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfi_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFI_B32 - - Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32() - { - } // ~Inst_VOP3__V_BFI_B32 - - // --- description from .arch file --- - // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert. 
- void - Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane] - & src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F32 class methods --- - - Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F32 - - Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32() - { - } // ~Inst_VOP3__V_FMA_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f. 
- void - Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F64 class methods --- - - Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f64", false) - { - setFlag(ALU); - setFlag(F64); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F64 - - Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64() - { - } // ~Inst_VOP3__V_FMA_F64 - - // --- description from .arch file --- - // D.d = S0.d * S1.d + S2.d. 
- void - Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LERP_U8 class methods --- - - Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lerp_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LERP_U8 - - Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8() - { - } // ~Inst_VOP3__V_LERP_U8 - - // --- description from .arch file --- - // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24 - // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16; - // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8; - // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1). - // Unsigned 8-bit pixel average on packed unsigned bytes (linear - // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up, - // --- otherwise 0.5 truncates. 
- void - Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ((bits(src0[lane], 31, 24) - + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1) - << 24; - vdst[lane] += ((bits(src0[lane], 23, 16) - + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1) - << 16; - vdst[lane] += ((bits(src0[lane], 15, 8) - + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1) - << 8; - vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0) - + bits(src2[lane], 0)) >> 1); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ALIGNBIT_B32 class methods --- - - Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_alignbit_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ALIGNBIT_B32 - - Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32() - { - } // ~Inst_VOP3__V_ALIGNBIT_B32 - - // --- description from .arch file --- - // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff. 
- void - Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) - | (VecElemU64)src1[lane]); - vdst[lane] = (VecElemU32)((src_0_1 - >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods --- - - Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_alignbyte_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ALIGNBYTE_B32 - - Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32() - { - } // ~Inst_VOP3__V_ALIGNBYTE_B32 - - // --- description from .arch file --- - // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff. 
- void - Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) - | (VecElemU64)src1[lane]); - vdst[lane] = (VecElemU32)((src_0_1 - >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0))) - & 0xffffffff); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_F32 class methods --- - - Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MIN3_F32 - - Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32() - { - } // ~Inst_VOP3__V_MIN3_F32 - - // --- description from .arch file --- - // D.f = min(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]); - vdst[lane] = std::fmin(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_I32 class methods --- - - Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN3_I32 - - Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32() - { - } // ~Inst_VOP3__V_MIN3_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]); - vdst[lane] = std::min(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_U32 class methods --- - - Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN3_U32 - - Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32() - { - } // ~Inst_VOP3__V_MIN3_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]); - vdst[lane] = std::min(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_F32 class methods --- - - Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MAX3_F32 - - Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32() - { - } // ~Inst_VOP3__V_MAX3_F32 - - // --- description from .arch file --- - // D.f = max(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]); - vdst[lane] = std::fmax(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_I32 class methods --- - - Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_I32 - - Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() - { - } // ~Inst_VOP3__V_MAX3_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_U32 class methods --- - - Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_U32 - - Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() - { - } // ~Inst_VOP3__V_MAX3_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_F32 class methods --- - - Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MED3_F32 - - Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() - { - } // ~Inst_VOP3__V_MED3_F32 - - // --- description from .arch file --- - // D.f = median(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_I32 class methods --- - - Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MED3_I32 - - Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32() - { - } // ~Inst_VOP3__V_MED3_I32 - - // --- description from .arch file --- - // D.i = median(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_U32 class methods --- - - Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MED3_U32 - - Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32() - { - } // ~Inst_VOP3__V_MED3_U32 - - // --- description from .arch file --- - // D.u = median(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U8 class methods --- - - Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U8 - - Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() - { - } // ~Inst_VOP3__V_SAD_U8 - - // --- description from .arch file --- - // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) + - // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u. - // Sum of absolute differences with accumulation, overflow into upper bits - // is allowed. 
- void - Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(bits(src0[lane], 31, 24) - - bits(src1[lane], 31, 24)) - + std::abs(bits(src0[lane], 23, 16) - - bits(src1[lane], 23, 16)) - + std::abs(bits(src0[lane], 15, 8) - - bits(src1[lane], 15, 8)) - + std::abs(bits(src0[lane], 7, 0) - - bits(src1[lane], 7, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_HI_U8 class methods --- - - Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_hi_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_HI_U8 - - Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8() - { - } // ~Inst_VOP3__V_SAD_HI_U8 - - // --- description from .arch file --- - // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u. - // Sum of absolute differences with accumulation, overflow is lost. 
- void - Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (((bits(src0[lane], 31, 24) - - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16) - - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8) - - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0) - - bits(src1[lane], 7, 0))) << 16) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U16 class methods --- - - Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U16 - - Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() - { - } // ~Inst_VOP3__V_SAD_U16 - - // --- description from .arch file --- - // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) - // + S2.u. - // Word SAD with accumulation. 
- void - Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(bits(src0[lane], 31, 16) - - bits(src1[lane], 31, 16)) - + std::abs(bits(src0[lane], 15, 0) - - bits(src1[lane], 15, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U32 class methods --- - - Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U32 - - Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32() - { - } // ~Inst_VOP3__V_SAD_U32 - - // --- description from .arch file --- - // D.u = abs(S0.i - S1.i) + S2.u. - // Dword SAD with accumulation. 
- void - Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane]; - } // if - } // for - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods --- - - Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PK_U8_F32 - - Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32() - { - } // ~Inst_VOP3__V_CVT_PK_U8_F32 - - // --- description from .arch file --- - // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0])) - // | (S2.u & ~(0xff << (8 * S1.u[1:0]))). - // Convert floating point value S0 to 8-bit unsigned integer and pack the - // result into byte S1 of dword S2. 
- void - Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (((VecElemU8)src0[lane] & 0xff) - << (8 * bits(src1[lane], 1, 0))) - | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0)))); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FIXUP_F32 class methods --- - - Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fixup_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_DIV_FIXUP_F32 - - Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32() - { - } // ~Inst_VOP3__V_DIV_FIXUP_F32 - - // --- description from .arch file --- - // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator, - // s2.f = Numerator. This opcode generates exceptions resulting from the - // division operation. 
- void - Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src1[lane]) == FP_ZERO) { - if (std::signbit(src1[lane])) { - vdst[lane] = -INFINITY; - } else { - vdst[lane] = +INFINITY; - } - } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src1[lane])) { - if (std::signbit(src1[lane])) { - vdst[lane] = -INFINITY; - } else { - vdst[lane] = +INFINITY; - } - } else { - vdst[lane] = src2[lane] / src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods --- - - Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fixup_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_DIV_FIXUP_F64 - - Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64() - { - } // ~Inst_VOP3__V_DIV_FIXUP_F64 - - // --- description from .arch file --- - // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator, - // s2.d = Numerator. This opcode generates exceptions resulting from the - // division operation. 
- void - Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int sign_out = std::signbit(src1[lane]) - ^ std::signbit(src2[lane]); - int exp1(0); - int exp2(0); - std::frexp(src1[lane], &exp1); - std::frexp(src2[lane], &exp2); - - if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { - vdst[lane] = std::numeric_limits::quiet_NaN(); - } else if (std::fpclassify(src1[lane]) == FP_ZERO - && std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] - = std::numeric_limits::signaling_NaN(); - } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { - vdst[lane] - = std::numeric_limits::signaling_NaN(); - } else if (std::fpclassify(src1[lane]) == FP_ZERO - || std::isinf(src2[lane])) { - vdst[lane] = sign_out ? -INFINITY : +INFINITY; - } else if (std::isinf(src1[lane]) - || std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] = sign_out ? -0.0 : +0.0; - } else if (exp2 - exp1 < -1075) { - vdst[lane] = src0[lane]; - } else if (exp1 == 2047) { - vdst[lane] = src0[lane]; - } else { - vdst[lane] = sign_out ? 
-std::fabs(src0[lane]) - : std::fabs(src0[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_SCALE_F32 class methods --- - - Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_div_scale_f32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F32); - } // Inst_VOP3__V_DIV_SCALE_F32 - - Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32() - { - } // ~Inst_VOP3__V_DIV_SCALE_F32 - - // --- description from .arch file --- - // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f = - // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. - void - Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane]; - vcc.setBit(lane, 0); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- - - Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_div_scale_f64") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F64); - } // Inst_VOP3__V_DIV_SCALE_F64 - - Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64() - { - 
} // ~Inst_VOP3__V_DIV_SCALE_F64 - - // --- description from .arch file --- - // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d = - // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. - void - Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp1(0); - int exp2(0); - std::frexp(src1[lane], &exp1); - std::frexp(src2[lane], &exp2); - vcc.setBit(lane, 0); - - if (std::fpclassify(src1[lane]) == FP_ZERO - || std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (exp2 - exp1 >= 768) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], 128); - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL - && std::fpclassify(src2[lane] / src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], -128); - } else if (std::fpclassify(src2[lane] / 
src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src2[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (exp2 <= 53) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FMAS_F32 class methods --- - - Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fmas_f32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F32); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F32 - - Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() - { - } // ~Inst_VOP3__V_DIV_FMAS_F32 - - // --- description from .arch file --- - // D.f = Special case divide FMA with scale and flags(s0.f = Quotient, - // s1.f = Denominator, s2.f = Numerator) - void - Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - //vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- - - Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fmas_f64", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F64); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F64 - - 
Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64() - { - } // ~Inst_VOP3__V_DIV_FMAS_F64 - - // --- description from .arch file --- - // D.d = Special case divide FMA with scale and flags(s0.d = Quotient, - // s1.d = Denominator, s2.d = Numerator) - void - Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - vcc.read(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(vcc.rawData(), lane)) { - vdst[lane] = std::pow(2, 64) - * std::fma(src0[lane], src1[lane], src2[lane]); - } else { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MSAD_U8 class methods --- - - Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_msad_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MSAD_U8 - - Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8() - { - } // ~Inst_VOP3__V_MSAD_U8 - - // --- description from .arch file --- - // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods --- - - Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_QSAD_PK_U16_U8 - - Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8() - { - } // ~Inst_VOP3__V_QSAD_PK_U16_U8 - - // --- description from .arch file --- - // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], - // S1.u[31:0], S2.u[63:0]) - void - Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods --- - - Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MQSAD_PK_U16_U8 - - Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8() - { - } // ~Inst_VOP3__V_MQSAD_PK_U16_U8 - - // --- description from .arch file --- - // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], - // --- S1.u[31:0], S2.u[63:0]) - void - Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MQSAD_U32_U8 class methods --- - - Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MQSAD_U32_U8 - - Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8() - { - } // ~Inst_VOP3__V_MQSAD_U32_U8 - - // --- description from .arch file --- - // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0], - // --- S1.u[31:0], S2.u[127:0]) - void - Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAD_U64_U32 class methods --- - - 
Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_mad_u64_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U64_U32 - - Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32() - { - } // ~Inst_VOP3__V_MAD_U64_U32 - - // --- description from .arch file --- - // {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64. - void - Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - vdst.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], - src2[lane])); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I64_I32 class methods --- - - Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_mad_i64_i32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I64_I32 - - Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32() - { - } // ~Inst_VOP3__V_MAD_I64_I32 - - // --- description from .arch file --- - // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64. 
- void - Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandI64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], - src2[lane])); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_XAD_U32 class methods --- - - Inst_VOP3__V_XAD_U32::Inst_VOP3__V_XAD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_xad_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_XAD_U32 - - Inst_VOP3__V_XAD_U32::~Inst_VOP3__V_XAD_U32() - { - } // ~Inst_VOP3__V_XAD_U32 - - // --- description from .arch file --- - // D.u32 = (S0.u32 ^ S1.u32) + S2.u32. 
- void - Inst_VOP3__V_XAD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHL_ADD_U32 class methods --- - - Inst_VOP3__V_LSHL_ADD_U32::Inst_VOP3__V_LSHL_ADD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshl_add_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHL_ADD_U32 - - Inst_VOP3__V_LSHL_ADD_U32::~Inst_VOP3__V_LSHL_ADD_U32() - { - } // ~Inst_VOP3__V_LSHL_ADD_U32 - - // --- description from .arch file --- - // D.u = (S0.u << S1.u[4:0]) + S2.u. 
- void - Inst_VOP3__V_LSHL_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) - + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_LSHL_U32 class methods --- - - Inst_VOP3__V_ADD_LSHL_U32::Inst_VOP3__V_ADD_LSHL_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_lshl_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_LSHL_U32 - - Inst_VOP3__V_ADD_LSHL_U32::~Inst_VOP3__V_ADD_LSHL_U32() - { - } // ~Inst_VOP3__V_ADD_LSHL_U32 - - // --- description from .arch file --- - // D.u = (S0.u + S1.u) << S2.u[4:0]. 
- void - Inst_VOP3__V_ADD_LSHL_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = - (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD3_U32 class methods --- - - Inst_VOP3__V_ADD3_U32::Inst_VOP3__V_ADD3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD3_U32 - - Inst_VOP3__V_ADD3_U32::~Inst_VOP3__V_ADD3_U32() - { - } // ~Inst_VOP3__V_ADD3_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + S2.u. 
- void - Inst_VOP3__V_ADD3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHL_OR_B32 class methods --- - - Inst_VOP3__V_LSHL_OR_B32::Inst_VOP3__V_LSHL_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshl_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHL_OR_B32 - - Inst_VOP3__V_LSHL_OR_B32::~Inst_VOP3__V_LSHL_OR_B32() - { - } // ~Inst_VOP3__V_LSHL_OR_B32 - - // --- description from .arch file --- - // D.u = (S0.u << S1.u[4:0]) | S2.u. 
- void - Inst_VOP3__V_LSHL_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) - | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_AND_OR_B32 class methods --- - - Inst_VOP3__V_AND_OR_B32::Inst_VOP3__V_AND_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_and_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_AND_OR_B32 - - Inst_VOP3__V_AND_OR_B32::~Inst_VOP3__V_AND_OR_B32() - { - } // ~Inst_VOP3__V_AND_OR_B32 - - // --- description from .arch file --- - // D.u = (S0.u & S1.u) | S2.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_AND_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] & src1[lane]) | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_F16 class methods --- - - Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP3__V_MAD_F16 - - Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16() - { - } // ~Inst_VOP3__V_MAD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + S2.f16. - // Supports round mode, exception flags, saturation. - void - Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAD_U16 class methods --- - - Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_u16", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U16 - - Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16() - { - } // ~Inst_VOP3__V_MAD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16 + S2.u16. - // Supports saturation (unsigned 16-bit integer domain). 
- void - Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I16 class methods --- - - Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_i16", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I16 - - Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() - { - } // ~Inst_VOP3__V_MAD_I16 - - // --- description from .arch file --- - // D.i16 = S0.i16 * S1.i16 + S2.i16. - // Supports saturation (signed 16-bit integer domain). 
- void - Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI16 src2(gpuDynInst, extData.SRC2); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_PERM_B32 class methods --- - - Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_perm_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_PERM_B32 - - Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32() - { - } // ~Inst_VOP3__V_PERM_B32 - - // --- description from .arch file --- - // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]); - // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]); - // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]); - // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]); - // byte permute(byte in[8], byte sel) { - // if(sel>=13) then return 0xff; - // elsif(sel==12) then return 0x00; - // elsif(sel==11) then return in[7][7] * 0xff; - // elsif(sel==10) then return in[5][7] * 0xff; - // elsif(sel==9) then return in[3][7] * 0xff; - // elsif(sel==8) then return in[1][7] * 0xff; - // else return in[sel]; - // } - // Byte permute. 
- void - Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 selector = (VecElemU64)src0[lane]; - selector = (selector << 32) | (VecElemU64)src1[lane]; - vdst[lane] = 0; - - DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 " - "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane], - src1[lane], src2[lane], vdst[lane]); - DPRINTF(VEGA, "Selector: 0x%08x \n", selector); - - for (int i = 0; i < 4 ; ++i) { - VecElemU32 permuted_val = permute(selector, 0xFF - & ((VecElemU32)src2[lane] >> (8 * i))); - vdst[lane] |= (permuted_val << (8 * i)); - } - - DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F16 class methods --- - - Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F16 - - Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16() - { - } // ~Inst_VOP3__V_FMA_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + S2.f16. - // Fused half precision multiply add. 
- void - Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods --- - - Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fixup_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_DIV_FIXUP_F16 - - Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16() - { - } // ~Inst_VOP3__V_DIV_FIXUP_F16 - - // --- description from .arch file --- - // sign_out = sign(S1.f16)^sign(S2.f16); - // if (S2.f16 == NAN) - // D.f16 = Quiet(S2.f16); - // else if (S1.f16 == NAN) - // D.f16 = Quiet(S1.f16); - // else if (S1.f16 == S2.f16 == 0) - // # 0/0 - // D.f16 = pele_nan(0xfe00); - // else if (abs(S1.f16) == abs(S2.f16) == +-INF) - // # inf/inf - // D.f16 = pele_nan(0xfe00); - // else if (S1.f16 ==0 || abs(S2.f16) == +-INF) - // # x/0, or inf/y - // D.f16 = sign_out ? -INF : INF; - // else if (abs(S1.f16) == +-INF || S2.f16 == 0) - // # x/inf, 0/y - // D.f16 = sign_out ? -0 : 0; - // else if ((exp(S2.f16) - exp(S1.f16)) < -150) - // D.f16 = sign_out ? -underflow : underflow; - // else if (exp(S1.f16) == 255) - // D.f16 = sign_out ? -overflow : overflow; - // else - // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16). - // Half precision division fixup. - // S0 = Quotient, S1 = Denominator, S3 = Numerator. - // Given a numerator, denominator, and quotient from a divide, this opcode - // will detect and apply special case numerics, touching up the quotient if - // necessary. This opcode also generates invalid, denorm and divide by - // zero exceptions caused by the division. 
- void - Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods --- - - Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PKACCUM_U8_F32 - - Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32() - { - } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32 - - // --- description from .arch file --- - // byte = S1.u[1:0]; bit = byte * 8; - // D.u[bit+7:bit] = flt32_to_uint8(S0.f); - // Pack converted value of S0.f into byte S1 of the destination. - // SQ translates to V_CVT_PK_U8_F32. - // Note: this opcode uses src_c to pass destination in as a source. - void - Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P1_F32 class methods --- - - Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p1_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_INTERP_P1_F32 - - Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32() - { - } // ~Inst_VOP3__V_INTERP_P1_F32 - - // --- description from .arch file --- - // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if - // D == S then data corruption will occur. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. 
- void - Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P2_F32 class methods --- - - Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p2_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_INTERP_P2_F32 - - Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32() - { - } // ~Inst_VOP3__V_INTERP_P2_F32 - - // --- description from .arch file --- - // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_MOV_F32 class methods --- - - Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_mov_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_INTERP_MOV_F32 - - Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32() - { - } // ~Inst_VOP3__V_INTERP_MOV_F32 - - // --- description from .arch file --- - // D.f = {P10,P20,P0}[S.u]; parameter load. - void - Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods --- - - Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_INTERP_P1LL_F16 - - Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16() - { - } // ~Inst_VOP3__V_INTERP_P1LL_F16 - - // --- description from .arch file --- - // D.f32 = P10.f16 * S0.f32 + P0.f16. 
- // 'LL' stands for 'two LDS arguments'. - // attr_word selects the high or low half 16 bits of each LDS dword - // accessed. - // This opcode is available for 32-bank LDS only. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods --- - - Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_INTERP_P1LV_F16 - - Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16() - { - } // ~Inst_VOP3__V_INTERP_P1LV_F16 - - // --- description from .arch file --- - // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16. - // 'LV' stands for 'One LDS and one VGPR argument'. - // S2 holds two parameters, attr_word selects the high or low word of the - // VGPR for this calculation, as well as the high or low half of the LDS - // data. - // Meant for use with 16-bank LDS. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. 
- void - Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_INTERP_P2_F16 class methods --- - - Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_interp_p2_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_INTERP_P2_F16 - - Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16() - { - } // ~Inst_VOP3__V_INTERP_P2_F16 - - // --- description from .arch file --- - // D.f16 = P20.f16 * S0.f32 + S2.f32. - // Final computation. attr_word selects LDS high or low 16bits. Used for - // both 16- and 32-bank LDS. - // Result is always written to the 16 LSBs of the destination VGPR. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_F64 class methods --- - - Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_ADD_F64 - - Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64() - { - } // ~Inst_VOP3__V_ADD_F64 - - // --- description from .arch file --- - // D.d = S0.d + S1.d. 
- void - Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane]) ) { - vdst[lane] = NAN; - } else if (std::isinf(src0[lane]) && - std::isinf(src1[lane])) { - if (std::signbit(src0[lane]) != - std::signbit(src1[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = src0[lane]; - } - } else if (std::isinf(src0[lane])) { - vdst[lane] = src0[lane]; - } else if (std::isinf(src1[lane])) { - vdst[lane] = src1[lane]; - } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - if (std::signbit(src0[lane]) && - std::signbit(src1[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = src1[lane]; - } - } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::signbit(src0[lane]) && - std::signbit(src1[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = src0[lane]; - } - } else { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- 
Inst_VOP3__V_MUL_F64 class methods --- - - Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_MUL_F64 - - Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64() - { - } // ~Inst_VOP3__V_MUL_F64 - - // --- description from .arch file --- - // D.d = S0.d * S1.d. - void - Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - 
vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_F64 class methods --- - - Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_MIN_F64 - - Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64() - { - } // ~Inst_VOP3__V_MIN_F64 - - // --- description from .arch file --- - // D.d = min(S0.d, S1.d). - void - Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_F64 class methods --- - - Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_MAX_F64 - - Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64() - { - } // ~Inst_VOP3__V_MAX_F64 - - // --- description from 
.arch file --- - // D.d = max(S0.d, S1.d). - void - Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LDEXP_F64 class methods --- - - Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ldexp_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_LDEXP_F64 - - Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64() - { - } // ~Inst_VOP3__V_LDEXP_F64 - - // --- description from .arch file --- - // D.d = pow(S0.d, S1.i[31:0]). 
- void - Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { - vdst[lane] = src0[lane]; - } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - || std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::signbit(src0[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = +0.0; - } - } else { - vdst[lane] = std::ldexp(src0[lane], src1[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LO_U32 class methods --- - - Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_lo_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_LO_U32 - - Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32() - { - } // ~Inst_VOP3__V_MUL_LO_U32 - - // --- description from .arch file --- - // D.u = S0.u * S1.u. 
- void - Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 s0 = (VecElemI64)src0[lane]; - VecElemI64 s1 = (VecElemI64)src1[lane]; - vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_U32 class methods --- - - Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_U32 - - Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32() - { - } // ~Inst_VOP3__V_MUL_HI_U32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32. 
- void - Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 s0 = (VecElemI64)src0[lane]; - VecElemI64 s1 = (VecElemI64)src1[lane]; - vdst[lane] - = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_I32 class methods --- - - Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_I32 - - Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32() - { - } // ~Inst_VOP3__V_MUL_HI_I32 - - // --- description from .arch file --- - // D.i = (S0.i * S1.i) >> 32. 
- void - Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 s0 = (VecElemI64)src0[lane]; - VecElemI64 s1 = (VecElemI64)src1[lane]; - vdst[lane] - = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LDEXP_F32 class methods --- - - Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ldexp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LDEXP_F32 - - Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32() - { - } // ~Inst_VOP3__V_LDEXP_F32 - - // --- description from .arch file --- - // D.f = pow(S0.f, S1.i) - void - Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ldexp(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_READLANE_B32 class methods --- - - 
Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_readlane_b32", true) - { - setFlag(ALU); - setFlag(IgnoreExec); - } // Inst_VOP3__V_READLANE_B32 - - Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32() - { - } // ~Inst_VOP3__V_READLANE_B32 - - // --- description from .arch file --- - // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR# - // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - sdst = src0[src1.rawData() & 0x3f]; - - sdst.write(); - } // execute - // --- Inst_VOP3__V_WRITELANE_B32 class methods --- - - Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_writelane_b32", false) - { - setFlag(ALU); - setFlag(IgnoreExec); - } // Inst_VOP3__V_WRITELANE_B32 - - Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32() - { - } // ~Inst_VOP3__V_WRITELANE_B32 - - // --- description from .arch file --- - // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data - // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores - // exec mask. - // Input and output modifiers not supported; this is an untyped operation. - // SQ translates to V_MOV_B32. 
- void - Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.read(); - src1.read(); - vdst.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - vdst[src1.rawData() & 0x3f] = src0.rawData(); - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BCNT_U32_B32 class methods --- - - Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BCNT_U32_B32 - - Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32() - { - } // ~Inst_VOP3__V_BCNT_U32_B32 - - // --- description from .arch file --- - // D.u = CountOneBits(S0.u) + S1.u. Bit count. 
- void - Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = popCount(src0[lane]) + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods --- - - Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MBCNT_LO_U32_B32 - - Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32() - { - } // ~Inst_VOP3__V_MBCNT_LO_U32_B32 - - // --- description from .arch file --- - // ThreadMask = (1 << ThreadPosition) - 1; - // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u. - // Masked bit count, ThreadPosition is the position of this thread in the - // --- wavefront (in 0..63). 
- void - Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- - - Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MBCNT_HI_U32_B32 - - Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32() - { - } // ~Inst_VOP3__V_MBCNT_HI_U32_B32 - - // --- description from .arch file --- - // ThreadMask = (1 << ThreadPosition) - 1; - // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u. - // Masked bit count, ThreadPosition is the position of this thread in the - // --- wavefront (in 0..63). 
- void - Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B64 class methods --- - - Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B64 - - Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64() - { - } // ~Inst_VOP3__V_LSHLREV_B64 - - // --- description from .arch file --- - // D.u64 = S1.u64 << S0.u[5:0]. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 5, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHRREV_B64 class methods --- - - Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshrrev_b64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B64 - - Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64() - { - } // ~Inst_VOP3__V_LSHRREV_B64 - - // --- description from .arch file --- - // D.u64 = S1.u64 >> S0.u[5:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I64 class methods --- - - Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I64 - - Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64() - { - } // ~Inst_VOP3__V_ASHRREV_I64 - - // --- description from .arch file --- - // D.u64 = signext(S1.u64) >> S0.u[5:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src1[lane] >> bits(src0[lane], 5, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods --- - - Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trig_preop_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_TRIG_PREOP_F64 - - Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64() - { - } // ~Inst_VOP3__V_TRIG_PREOP_F64 - - // --- description from .arch file --- - // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation - // returns an aligned, double precision segment of 2/PI needed to do range - // reduction on S0.d (double-precision value). Multiple segments can be - // specified through S1.u[4:0]. Rounding is always round-to-zero. Large - // inputs (exp > 1968) are scaled to avoid loss of precision through - // denormalization. 
- void - Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_BFM_B32 class methods --- - - Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfm_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFM_B32 - - Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32() - { - } // ~Inst_VOP3__V_BFM_B32 - - // --- description from .arch file --- - // D.u = ((1<wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1) - << bits(src1[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods --- - - Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PKNORM_I16_F32 - - Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32() - { - } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32 - - // --- description from .arch file --- - // D = {(snorm)S1.f, (snorm)S0.f}. 
- void - Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods --- - - Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PKNORM_U16_F32 - - Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32() - { - } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32 - - // --- description from .arch file --- - // D = {(unorm)S1.f, (unorm)S0.f}. - void - Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods --- - - Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PKRTZ_F16_F32 - - Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32() - { - } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32 - - // --- description from .arch file --- - // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero - // --- regardless of current round mode setting in hardware. - // This opcode is intended for use with 16-bit compressed exports. - // See V_CVT_F16_F32 for a version that respects the current rounding mode. - void - Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods --- - - Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_CVT_PK_U16_U32 - - Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32() - { - } // ~Inst_VOP3__V_CVT_PK_U16_U32 - - // --- description from .arch file --- - // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}. 
- void - Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods --- - - Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_CVT_PK_I16_I32 - - Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32() - { - } // ~Inst_VOP3__V_CVT_PK_I16_I32 - - // --- description from .arch file --- - // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}. - void - Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_U32 class methods --- - - Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_u32") - { - setFlag(MemoryRef); - setFlag(GroupSegment); - setFlag(AtomicAdd); - setFlag(AtomicNoReturn); - } // Inst_DS__DS_ADD_U32 - - Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32() - { - } // ~Inst_DS__DS_ADD_U32 - - // --- description from .arch file --- - // 32b: - // MEM[ADDR] += DATA; - void - Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_ADD_U32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = 
instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initAtomicAccess(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_ADD_U32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_SUB_U32 class methods --- - - Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_u32") - { - } // Inst_DS__DS_SUB_U32 - - Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32() - { - } // ~Inst_DS__DS_SUB_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_U32 class methods --- - - Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_u32") - { - } // Inst_DS__DS_RSUB_U32 - - Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32() - { - } // ~Inst_DS__DS_RSUB_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_U32 class methods --- - - Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_u32") - { - } // Inst_DS__DS_INC_U32 - - Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32() - { - } // ~Inst_DS__DS_INC_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_U32 class methods --- - - Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_u32") - { - } // Inst_DS__DS_DEC_U32 - - Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32() - { - } // ~Inst_DS__DS_DEC_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_I32 class methods --- - - Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_i32") - { - } // Inst_DS__DS_MIN_I32 - - Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32() - { - } // ~Inst_DS__DS_MIN_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_I32 class methods --- - - Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_i32") - { - } // Inst_DS__DS_MAX_I32 - - Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32() - { - } // ~Inst_DS__DS_MAX_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_U32 class methods --- - - Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_u32") - { - } // Inst_DS__DS_MIN_U32 - - Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32() - { - } // ~Inst_DS__DS_MIN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_U32 class methods --- - - Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_u32") - { - } // Inst_DS__DS_MAX_U32 - - Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32() - { - } // ~Inst_DS__DS_MAX_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_B32 class methods --- - - Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_b32") - { - } // Inst_DS__DS_AND_B32 - - Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32() - { - } // ~Inst_DS__DS_AND_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_B32 class methods --- - - Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_b32") - { - setFlag(MemoryRef); - setFlag(GroupSegment); - setFlag(AtomicOr); - setFlag(AtomicNoReturn); - } // Inst_DS__DS_OR_B32 - - Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32() - { - } // ~Inst_DS__DS_OR_B32 - - // --- description from .arch file --- - // 32b: - // MEM[ADDR] |= DATA; - void - Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initAtomicAccess(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - // --- Inst_DS__DS_XOR_B32 class methods --- - - Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_b32") - { - } // Inst_DS__DS_XOR_B32 - - Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32() - { - } // ~Inst_DS__DS_XOR_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // 
MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_B32 class methods --- - - Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_b32") - { - } // Inst_DS__DS_MSKOR_B32 - - Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32() - { - } // ~Inst_DS__DS_MSKOR_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. - void - Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_B32 class methods --- - - Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B32 - - Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32() - { - } // ~Inst_DS__DS_WRITE_B32 - - // --- description from .arch file --- - // 32b: - // MEM[ADDR] = DATA. - // Write dword. 
- void - Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE2_B32 class methods --- - - Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2_B32 - - Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32() - { - } // ~Inst_DS__DS_WRITE2_B32 - - // --- description from .arch file --- - // 32b: - // MEM[ADDR_BASE + OFFSET0 * 4] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2. - // Write 2 dwords. 
- void - Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 2] - = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4; - Addr offset1 = instData.OFFSET1 * 4; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_WRITE2ST64_B32 class methods --- - - Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2st64_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2ST64_B32 - - Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32() - { - } // ~Inst_DS__DS_WRITE2ST64_B32 - - // --- description from .arch file --- - // 32b: - // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2; - // Write 2 dwords. 
- void - Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 2] - = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4 * 64; - Addr offset1 = instData.OFFSET1 * 4 * 64; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_CMPST_B32 class methods --- - - Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_b32") - { - } // Inst_DS__DS_CMPST_B32 - - Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32() - { - } // ~Inst_DS__DS_CMPST_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP opcode. 
- void - Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_F32 class methods --- - - Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_f32") - { - setFlag(F32); - } // Inst_DS__DS_CMPST_F32 - - Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32() - { - } // ~Inst_DS__DS_CMPST_F32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP opcode. - void - Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_F32 class methods --- - - Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_f32") - { - setFlag(F32); - } // Inst_DS__DS_MIN_F32 - - Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32() - { - } // ~Inst_DS__DS_MIN_F32 - - // --- description from .arch file --- - // 32b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN. - void - Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_F32 class methods --- - - Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_f32") - { - setFlag(F32); - } // Inst_DS__DS_MAX_F32 - - Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32() - { - } // ~Inst_DS__DS_MAX_F32 - - // --- description from .arch file --- - // 32b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. 
- // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX. - void - Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_NOP class methods --- - - Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_nop") - { - setFlag(Nop); - } // Inst_DS__DS_NOP - - Inst_DS__DS_NOP::~Inst_DS__DS_NOP() - { - } // ~Inst_DS__DS_NOP - - // --- description from .arch file --- - // Do nothing. - void - Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst) - { - gpuDynInst->wavefront()->decLGKMInstsIssued(); - } // execute - // --- Inst_DS__DS_ADD_F32 class methods --- - - Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_f32") - { - setFlag(F32); - setFlag(MemoryRef); - setFlag(GroupSegment); - setFlag(AtomicAdd); - setFlag(AtomicNoReturn); - } // Inst_DS__DS_ADD_F32 - - Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32() - { - } // ~Inst_DS__DS_ADD_F32 - - // --- description from .arch file --- - // 32b: - // MEM[ADDR] += DATA; - // Floating point add that handles NaN/INF/denormal values. 
- void - Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandF32 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initAtomicAccess(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B8 class methods --- - - Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b8") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B8 - - Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8() - { - } // ~Inst_DS__DS_WRITE_B8 - - // --- description from .arch file --- - // MEM[ADDR] = DATA[7:0]. - // Byte write. 
- void - Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU8 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B8_D16_HI class methods --- - - Inst_DS__DS_WRITE_B8_D16_HI::Inst_DS__DS_WRITE_B8_D16_HI(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b8_d16_hi") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B8_D16_HI - - Inst_DS__DS_WRITE_B8_D16_HI::~Inst_DS__DS_WRITE_B8_D16_HI() - { - } // ~Inst_DS__DS_WRITE_B8_D16_HI - - // --- description from .arch file --- - // MEM[ADDR] = DATA[23:16]. - // Byte write in to high word. 
- void - Inst_DS__DS_WRITE_B8_D16_HI::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU8 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = bits(data[lane], 23, 16); - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B8_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B8_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B16 class methods --- - - Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b16") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B16 - - Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16() - { - } // ~Inst_DS__DS_WRITE_B16 - - // --- description from .arch file --- - // MEM[ADDR] = DATA[15:0] - // Short write. 
- void - Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU16 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_ADD_RTN_U32 class methods --- - - Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_u32") - { - } // Inst_DS__DS_ADD_RTN_U32 - - Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32() - { - } // ~Inst_DS__DS_ADD_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_RTN_U32 class methods --- - - Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_rtn_u32") - { - } // Inst_DS__DS_SUB_RTN_U32 - - Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32() - { - } // ~Inst_DS__DS_SUB_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_RTN_U32 class methods --- - - Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_rtn_u32") - { - } // Inst_DS__DS_RSUB_RTN_U32 - - Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32() - { - } // ~Inst_DS__DS_RSUB_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_RTN_U32 class methods --- - - Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_rtn_u32") - { - } // Inst_DS__DS_INC_RTN_U32 - - Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32() - { - } // ~Inst_DS__DS_INC_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_RTN_U32 class methods --- - - Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_rtn_u32") - { - } // Inst_DS__DS_DEC_RTN_U32 - - Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32() - { - } // ~Inst_DS__DS_DEC_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_I32 class methods --- - - Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_i32") - { - } // Inst_DS__DS_MIN_RTN_I32 - - Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32() - { - } // ~Inst_DS__DS_MIN_RTN_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_I32 class methods --- - - Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_i32") - { - } // Inst_DS__DS_MAX_RTN_I32 - - Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32() - { - } // ~Inst_DS__DS_MAX_RTN_I32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_U32 class methods --- - - Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_u32") - { - } // Inst_DS__DS_MIN_RTN_U32 - - Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32() - { - } // ~Inst_DS__DS_MIN_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_U32 class methods --- - - Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_u32") - { - } // Inst_DS__DS_MAX_RTN_U32 - - Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32() - { - } // ~Inst_DS__DS_MAX_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_RTN_B32 class methods --- - - Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_rtn_b32") - { - } // Inst_DS__DS_AND_RTN_B32 - - Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32() - { - } // ~Inst_DS__DS_AND_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_RTN_B32 class methods --- - - Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_rtn_b32") - { - } // Inst_DS__DS_OR_RTN_B32 - - Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32() - { - } // ~Inst_DS__DS_OR_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_RTN_B32 class methods --- - - Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_rtn_b32") - { - } // Inst_DS__DS_XOR_RTN_B32 - - Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32() - { - } // ~Inst_DS__DS_XOR_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_RTN_B32 class methods --- - - Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_rtn_b32") - { - } // Inst_DS__DS_MSKOR_RTN_B32 - - Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32() - { - } // ~Inst_DS__DS_MSKOR_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. 
- void - Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG_RTN_B32 class methods --- - - Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg_rtn_b32") - { - } // Inst_DS__DS_WRXCHG_RTN_B32 - - Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32() - { - } // ~Inst_DS__DS_WRXCHG_RTN_B32 - - // --- description from .arch file --- - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - // Write-exchange operation. - void - Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2_RTN_B32 class methods --- - - Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32") - { - } // Inst_DS__DS_WRXCHG2_RTN_B32 - - Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32() - { - } // ~Inst_DS__DS_WRXCHG2_RTN_B32 - - // --- description from .arch file --- - // Write-exchange 2 separate dwords. - void - Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2ST64_RTN_B32 class methods --- - - Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32") - { - } // Inst_DS__DS_WRXCHG2ST64_RTN_B32 - - Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32() - { - } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32 - - // --- description from .arch file --- - // Write-exchange 2 separate dwords with a stride of 64 dwords. 
- void - Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_B32 class methods --- - - Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_b32") - { - } // Inst_DS__DS_CMPST_RTN_B32 - - Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32() - { - } // ~Inst_DS__DS_CMPST_RTN_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP opcode. - void - Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_F32 class methods --- - - Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_CMPST_RTN_F32 - - Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32() - { - } // ~Inst_DS__DS_CMPST_RTN_F32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP opcode. - void - Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_F32 class methods --- - - Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_MIN_RTN_F32 - - Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32() - { - } // ~Inst_DS__DS_MIN_RTN_F32 - - // --- description from .arch file --- - // 32b. 
- // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN. - void - Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_F32 class methods --- - - Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_MAX_RTN_F32 - - Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32() - { - } // ~Inst_DS__DS_MAX_RTN_F32 - - // --- description from .arch file --- - // 32b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX. - void - Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRAP_RTN_B32 class methods --- - - Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrap_rtn_b32") - { - } // Inst_DS__DS_WRAP_RTN_B32 - - Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32() - { - } // ~Inst_DS__DS_WRAP_RTN_B32 - - // --- description from .arch file --- - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2; - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_RTN_F32 class methods --- - - Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_f32") - { - setFlag(F32); - } // Inst_DS__DS_ADD_RTN_F32 - - Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32() - { - } // ~Inst_DS__DS_ADD_RTN_F32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - // Floating point add that handles NaN/INF/denormal values. - void - Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_B32 class methods --- - - Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B32 - - Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32() - { - } // ~Inst_DS__DS_READ_B32 - - // --- description from .arch file --- - // RETURN_DATA = MEM[ADDR]. - // Dword read. 
- void - Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ2_B32 class methods --- - - Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B32 - - Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32() - { - } // ~Inst_DS__DS_READ2_B32 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4]. - // Read 2 dwords. 
- void - Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4; - Addr offset1 = instData.OFFSET1 * 4; - - initDualMemRead(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_DS__DS_READ2ST64_B32 class methods --- - - Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B32 - - Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32() - { - } // ~Inst_DS__DS_READ2ST64_B32 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64]. - // Read 2 dwords. 
- void - Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 4 * 64); - Addr offset1 = (instData.OFFSET1 * 4 * 64); - - initDualMemRead(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - // --- Inst_DS__DS_READ_I8 class methods --- - - Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I8 - - Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() - { - } // ~Inst_DS__DS_READ_I8 - - // --- description from .arch file --- - // RETURN_DATA = signext(MEM[ADDR][7:0]). - // Signed byte read. 
- void - Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_I8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_I8::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)sext<8>((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ_U8 class methods --- - - Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U8 - - Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() - { - } // ~Inst_DS__DS_READ_U8 - - // --- description from .arch file --- - // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. - // Unsigned byte read. 
- void - Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ_I16 class methods --- - - Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I16 - - Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() - { - } // ~Inst_DS__DS_READ_I16 - - // --- description from .arch file --- - // RETURN_DATA = signext(MEM[ADDR][15:0]). - // Signed short read. 
- void - Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_U16 class methods --- - - Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U16 - - Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() - { - } // ~Inst_DS__DS_READ_U16 - - // --- description from .arch file --- - // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. - // Unsigned short read. - void - Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - void - Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_SWIZZLE_B32 class methods --- - - Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_swizzle_b32") - { - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the 
LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_SWIZZLE_B32 - - Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() - { - } // ~Inst_DS__DS_SWIZZLE_B32 - - // --- description from .arch file --- - // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). - // Dword swizzle, no data is written to LDS memory; See ds_opcodes.docx for - // --- details. - void - Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - /** - * The "DS pattern" is comprised of both offset fields. That is, the - * swizzle pattern between lanes. Bit 15 of the DS pattern dictates - * which swizzle mode to use. There are two different swizzle - * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use - * QDMode else use Bit-masks mode. The remaining bits dictate how to - * swizzle the lanes. - * - * QDMode: Chunks the lanes into 4s and swizzles among them. - * Bits 7:6 dictate where lane 3 (of the current chunk) - * gets its date, 5:4 lane 2, etc. - * - * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks. - * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 - * is the and_mask. Each lane is swizzled by performing - * the appropriate operation using these masks. - */ - VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); - - data.read(); - - if (bits(ds_pattern, 15)) { - // QDMode - for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { - /** - * This operation allows data sharing between groups - * of four consecutive threads. Note the increment by - * 4 in the for loop. 
- */ - if (gpuDynInst->exec_mask[lane]) { - int index0 = lane + bits(ds_pattern, 1, 0); - panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index0); - vdst[lane] - = gpuDynInst->exec_mask[index0] ? data[index0]: 0; - } - if (gpuDynInst->exec_mask[lane + 1]) { - int index1 = lane + bits(ds_pattern, 3, 2); - panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index1); - vdst[lane + 1] - = gpuDynInst->exec_mask[index1] ? data[index1]: 0; - } - if (gpuDynInst->exec_mask[lane + 2]) { - int index2 = lane + bits(ds_pattern, 5, 4); - panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index2); - vdst[lane + 2] - = gpuDynInst->exec_mask[index2] ? data[index2]: 0; - } - if (gpuDynInst->exec_mask[lane + 3]) { - int index3 = lane + bits(ds_pattern, 7, 6); - panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index3); - vdst[lane + 3] - = gpuDynInst->exec_mask[index3] ? data[index3]: 0; - } - } - } else { - // Bit Mode - int and_mask = bits(ds_pattern, 4, 0); - int or_mask = bits(ds_pattern, 9, 5); - int xor_mask = bits(ds_pattern, 14, 10); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - int index = (((lane & and_mask) | or_mask) ^ xor_mask); - // Adjust for the next 32 lanes. - if (lane > 31) { - index += 32; - } - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is " - "out of bounds.\n", gpuDynInst->disassemble(), - index); - vdst[lane] - = gpuDynInst->exec_mask[index] ? data[index] : 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. 
- * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - // --- Inst_DS__DS_PERMUTE_B32 class methods --- - - Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_permute_b32") - { - setFlag(MemoryRef); - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_PERMUTE_B32 - - Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() - { - } // ~Inst_DS__DS_PERMUTE_B32 - - // --- description from .arch file --- - // Forward permute. - void - Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - addr.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - /** - * One of the offset fields can be used for the index. - * It is assumed OFFSET0 would be used, as OFFSET1 is - * typically only used for DS ops that operate on two - * disparate pieces of data. - */ - assert(!instData.OFFSET1); - /** - * The address provided is a byte address, but VGPRs are - * 4 bytes, so we must divide by 4 to get the actual VGPR - * index. 
Additionally, the index is calculated modulo the - * WF size, 64 in this case, so we simply extract bits 7-2. - */ - int index = bits(addr[lane] + instData.OFFSET0, 7, 2); - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " - "of bounds.\n", gpuDynInst->disassemble(), index); - /** - * If the shuffled index corresponds to a lane that is - * inactive then this instruction writes a 0 to the active - * lane in VDST. - */ - if (wf->execMask(index)) { - vdst[index] = data[lane]; - } else { - vdst[index] = 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. - * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - // --- Inst_DS__DS_BPERMUTE_B32 class methods --- - - Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_bpermute_b32") - { - setFlag(MemoryRef); - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_BPERMUTE_B32 - - Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() - { - } // ~Inst_DS__DS_BPERMUTE_B32 - - // --- description from .arch file --- - // Backward permute. 
- void - Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - addr.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - /** - * One of the offset fields can be used for the index. - * It is assumed OFFSET0 would be used, as OFFSET1 is - * typically only used for DS ops that operate on two - * disparate pieces of data. - */ - assert(!instData.OFFSET1); - /** - * The address provided is a byte address, but VGPRs are - * 4 bytes, so we must divide by 4 to get the actual VGPR - * index. Additionally, the index is calculated modulo the - * WF size, 64 in this case, so we simply extract bits 7-2. - */ - int index = bits(addr[lane] + instData.OFFSET0, 7, 2); - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " - "of bounds.\n", gpuDynInst->disassemble(), index); - /** - * If the shuffled index corresponds to a lane that is - * inactive then this instruction writes a 0 to the active - * lane in VDST. - */ - if (wf->execMask(index)) { - vdst[lane] = data[index]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. 
- * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - - // --- Inst_DS__DS_ADD_U64 class methods --- - - Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_u64") - { - setFlag(MemoryRef); - setFlag(GroupSegment); - setFlag(AtomicAdd); - setFlag(AtomicNoReturn); - } // Inst_DS__DS_ADD_U64 - - Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() - { - } // ~Inst_DS__DS_ADD_U64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR] += DATA[0:1]; - void - Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_ADD_U64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initAtomicAccess(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_ADD_U64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_SUB_U64 class methods --- - - 
Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_u64") - { - } // Inst_DS__DS_SUB_U64 - - Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64() - { - } // ~Inst_DS__DS_SUB_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_U64 class methods --- - - Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_u64") - { - } // Inst_DS__DS_RSUB_U64 - - Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64() - { - } // ~Inst_DS__DS_RSUB_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_U64 class methods --- - - Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_u64") - { - } // Inst_DS__DS_INC_U64 - - Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64() - { - } // ~Inst_DS__DS_INC_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_U64 class methods --- - - Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_u64") - { - } // Inst_DS__DS_DEC_U64 - - Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64() - { - } // ~Inst_DS__DS_DEC_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_I64 class methods --- - - Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_i64") - { - } // Inst_DS__DS_MIN_I64 - - Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64() - { - } // ~Inst_DS__DS_MIN_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_I64 class methods --- - - Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_i64") - { - } // Inst_DS__DS_MAX_I64 - - Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64() - { - } // ~Inst_DS__DS_MAX_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_U64 class methods --- - - Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_u64") - { - } // Inst_DS__DS_MIN_U64 - - Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64() - { - } // ~Inst_DS__DS_MIN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_U64 class methods --- - - Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_u64") - { - } // Inst_DS__DS_MAX_U64 - - Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64() - { - } // ~Inst_DS__DS_MAX_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_B64 class methods --- - - Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_b64") - { - } // Inst_DS__DS_AND_B64 - - Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64() - { - } // ~Inst_DS__DS_AND_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_B64 class methods --- - - Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_b64") - { - } // Inst_DS__DS_OR_B64 - - Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64() - { - } // ~Inst_DS__DS_OR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_B64 class methods --- - - Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_b64") - { - } // Inst_DS__DS_XOR_B64 - - Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64() - { - } // ~Inst_DS__DS_XOR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_B64 class methods --- - - Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_b64") - { - } // Inst_DS__DS_MSKOR_B64 - - Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64() - { - } // ~Inst_DS__DS_MSKOR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. - void - Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_B64 class methods --- - - Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B64 - - Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64() - { - } // ~Inst_DS__DS_WRITE_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR] = DATA. - // Write qword. 
- void - Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast<VecElemU64*>(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<VecElemU64>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE2_B64 class methods --- - - Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2_B64 - - Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() - { - } // ~Inst_DS__DS_WRITE2_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. - // Write 2 qwords. 
- void - Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2] = data0[lane]; - (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemWrite<VecElemU64>(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_WRITE2ST64_B64 class methods --- - - Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2st64_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2ST64_B64 - - Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() - { - } // ~Inst_DS__DS_WRITE2ST64_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; - // Write 2 qwords. 
- void - Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2] = data0[lane]; - (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8 * 64; - Addr offset1 = instData.OFFSET1 * 8 * 64; - - initDualMemWrite<VecElemU64>(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_CMPST_B64 class methods --- - - Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_b64") - { - } // Inst_DS__DS_CMPST_B64 - - Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() - { - } // ~Inst_DS__DS_CMPST_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. 
- void - Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_F64 class methods --- - - Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_f64") - { - setFlag(F64); - } // Inst_DS__DS_CMPST_F64 - - Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64() - { - } // ~Inst_DS__DS_CMPST_F64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. - void - Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_F64 class methods --- - - Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_F64 - - Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64() - { - } // ~Inst_DS__DS_MIN_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN_X2. - void - Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_F64 class methods --- - - Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_F64 - - Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64() - { - } // ~Inst_DS__DS_MAX_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. 
- // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX_X2. - void - Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_RTN_U64 class methods --- - - Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_u64") - { - } // Inst_DS__DS_ADD_RTN_U64 - - Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64() - { - } // ~Inst_DS__DS_ADD_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_RTN_U64 class methods --- - - Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_rtn_u64") - { - } // Inst_DS__DS_SUB_RTN_U64 - - Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64() - { - } // ~Inst_DS__DS_SUB_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_RTN_U64 class methods --- - - Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_rtn_u64") - { - } // Inst_DS__DS_RSUB_RTN_U64 - - Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64() - { - } // ~Inst_DS__DS_RSUB_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. 
- void - Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_RTN_U64 class methods --- - - Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_rtn_u64") - { - } // Inst_DS__DS_INC_RTN_U64 - - Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64() - { - } // ~Inst_DS__DS_INC_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_RTN_U64 class methods --- - - Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_rtn_u64") - { - } // Inst_DS__DS_DEC_RTN_U64 - - Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64() - { - } // ~Inst_DS__DS_DEC_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_I64 class methods --- - - Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_i64") - { - } // Inst_DS__DS_MIN_RTN_I64 - - Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64() - { - } // ~Inst_DS__DS_MIN_RTN_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_I64 class methods --- - - Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_i64") - { - } // Inst_DS__DS_MAX_RTN_I64 - - Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64() - { - } // ~Inst_DS__DS_MAX_RTN_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_U64 class methods --- - - Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_u64") - { - } // Inst_DS__DS_MIN_RTN_U64 - - Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64() - { - } // ~Inst_DS__DS_MIN_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_U64 class methods --- - - Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_u64") - { - } // Inst_DS__DS_MAX_RTN_U64 - - Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64() - { - } // ~Inst_DS__DS_MAX_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_RTN_B64 class methods --- - - Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_rtn_b64") - { - } // Inst_DS__DS_AND_RTN_B64 - - Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64() - { - } // ~Inst_DS__DS_AND_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_RTN_B64 class methods --- - - Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_rtn_b64") - { - } // Inst_DS__DS_OR_RTN_B64 - - Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64() - { - } // ~Inst_DS__DS_OR_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_RTN_B64 class methods --- - - Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_rtn_b64") - { - } // Inst_DS__DS_XOR_RTN_B64 - - Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64() - { - } // ~Inst_DS__DS_XOR_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_RTN_B64 class methods --- - - Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_rtn_b64") - { - } // Inst_DS__DS_MSKOR_RTN_B64 - - Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64() - { - } // ~Inst_DS__DS_MSKOR_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. - void - Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG_RTN_B64 class methods --- - - Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg_rtn_b64") - { - } // Inst_DS__DS_WRXCHG_RTN_B64 - - Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG_RTN_B64 - - // --- description from .arch file --- - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - // Write-exchange operation. - void - Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2_RTN_B64 class methods --- - - Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64") - { - } // Inst_DS__DS_WRXCHG2_RTN_B64 - - Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG2_RTN_B64 - - // --- description from .arch file --- - // Write-exchange 2 separate qwords. 
- void - Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRXCHG2ST64_RTN_B64 class methods --- - - Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64") - { - } // Inst_DS__DS_WRXCHG2ST64_RTN_B64 - - Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64 - - // --- description from .arch file --- - // Write-exchange 2 qwords with a stride of 64 qwords. - void - Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_B64 class methods --- - - Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_b64") - { - } // Inst_DS__DS_CMPST_RTN_B64 - - Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64() - { - } // ~Inst_DS__DS_CMPST_RTN_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. - void - Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_RTN_F64 class methods --- - - Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_CMPST_RTN_F64 - - Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64() - { - } // ~Inst_DS__DS_CMPST_RTN_F64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. 
- // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. - void - Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_RTN_F64 class methods --- - - Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_RTN_F64 - - Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64() - { - } // ~Inst_DS__DS_MIN_RTN_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN_X2. - void - Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_RTN_F64 class methods --- - - Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_RTN_F64 - - Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64() - { - } // ~Inst_DS__DS_MAX_RTN_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX_X2. 
- void - Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_B64 class methods --- - - Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B64 - - Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() - { - } // ~Inst_DS__DS_READ_B64 - - // --- description from .arch file --- - // RETURN_DATA = MEM[ADDR]. - // Read 1 qword. - void - Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<VecElemU64>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ2_B64 class methods --- - - Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B64 - - Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() - { - } // ~Inst_DS__DS_READ2_B64 - - // --- description from .arch file --- - // 
RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. - // Read 2 qwords. - void - Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemRead<VecElemU64>(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_DS__DS_READ2ST64_B64 class methods --- - - Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B64 - - Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() - { - } // ~Inst_DS__DS_READ2ST64_B64 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. - // Read 2 qwords. 
- void - Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 8 * 64); - Addr offset1 = (instData.OFFSET1 * 8 * 64); - - initDualMemRead<VecElemU64>(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - // --- Inst_DS__DS_CONDXCHG32_RTN_B64 class methods --- - - Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") - { - } // Inst_DS__DS_CONDXCHG32_RTN_B64 - - Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() - { - } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 - - // --- description from .arch file --- - // Conditional write exchange. 
- void - Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_SRC2_U32 class methods --- - - Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_src2_u32") - { - } // Inst_DS__DS_ADD_SRC2_U32 - - Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32() - { - } // ~Inst_DS__DS_ADD_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] + MEM[B]. - void - Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_SRC2_U32 class methods --- - - Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_src2_u32") - { - } // Inst_DS__DS_SUB_SRC2_U32 - - Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32() - { - } // ~Inst_DS__DS_SUB_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] - MEM[B]. - void - Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_SRC2_U32 class methods --- - - Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_src2_u32") - { - } // Inst_DS__DS_RSUB_SRC2_U32 - - Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32() - { - } // ~Inst_DS__DS_RSUB_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B] - MEM[A]. 
- void - Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_SRC2_U32 class methods --- - - Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_src2_u32") - { - } // Inst_DS__DS_INC_SRC2_U32 - - Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32() - { - } // ~Inst_DS__DS_INC_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). - void - Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_SRC2_U32 class methods --- - - Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_src2_u32") - { - } // Inst_DS__DS_DEC_SRC2_U32 - - Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32() - { - } // ~Inst_DS__DS_DEC_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). - // Uint decrement. - void - Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_I32 class methods --- - - Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_i32") - { - } // Inst_DS__DS_MIN_SRC2_I32 - - Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32() - { - } // ~Inst_DS__DS_MIN_SRC2_I32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). 
- void - Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_I32 class methods --- - - Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_i32") - { - } // Inst_DS__DS_MAX_SRC2_I32 - - Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32() - { - } // ~Inst_DS__DS_MAX_SRC2_I32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). - void - Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_U32 class methods --- - - Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_u32") - { - } // Inst_DS__DS_MIN_SRC2_U32 - - Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32() - { - } // ~Inst_DS__DS_MIN_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). - void - Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_U32 class methods --- - - Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_u32") - { - } // Inst_DS__DS_MAX_SRC2_U32 - - Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32() - { - } // ~Inst_DS__DS_MAX_SRC2_U32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). 
- void - Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_SRC2_B32 class methods --- - - Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_src2_b32") - { - } // Inst_DS__DS_AND_SRC2_B32 - - Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32() - { - } // ~Inst_DS__DS_AND_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] & MEM[B]. - void - Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_SRC2_B32 class methods --- - - Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_src2_b32") - { - } // Inst_DS__DS_OR_SRC2_B32 - - Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32() - { - } // ~Inst_DS__DS_OR_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] | MEM[B]. - void - Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_SRC2_B32 class methods --- - - Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_src2_b32") - { - } // Inst_DS__DS_XOR_SRC2_B32 - - Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32() - { - } // ~Inst_DS__DS_XOR_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] ^ MEM[B]. 
- void - Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_SRC2_B32 class methods --- - - Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_src2_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_SRC2_B32 - - Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32() - { - } // ~Inst_DS__DS_WRITE_SRC2_B32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B]. - // Write dword. - void - Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_F32 class methods --- - - Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_f32") - { - setFlag(F32); - } // Inst_DS__DS_MIN_SRC2_F32 - - Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32() - { - } // ~Inst_DS__DS_MIN_SRC2_F32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_F32 class methods --- - - Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_f32") - { - setFlag(F32); - } // Inst_DS__DS_MAX_SRC2_F32 - - Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32() - { - } // ~Inst_DS__DS_MAX_SRC2_F32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. 
- // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_SRC2_F32 class methods --- - - Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_src2_f32") - { - setFlag(F32); - } // Inst_DS__DS_ADD_SRC2_F32 - - Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32() - { - } // ~Inst_DS__DS_ADD_SRC2_F32 - - // --- description from .arch file --- - // 32b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B] + MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_RELEASE_ALL class methods --- - - Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_release_all") - { - } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL - - Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL() - { - } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL - - // --- description from .arch file --- - // GDS Only: The GWS resource (rid) indicated will process this opcode by - // updating the counter and labeling the specified resource as a semaphore. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // //Incr the state counter of the resource - // state.counter[rid] = state.wave_in_queue; - // state.type = SEMAPHORE; - // return rd_done; //release calling wave - // This action will release ALL queued waves; it Will have no effect if no - // --- waves are present. 
- void - Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_INIT class methods --- - - Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_init") - { - } // Inst_DS__DS_GWS_INIT - - Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT() - { - } // ~Inst_DS__DS_GWS_INIT - - // --- description from .arch file --- - // GDS Only: Initialize a barrier or semaphore resource. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // //Get the value to use in init - // index = find_first_valid(vector mask) - // value = DATA[thread: index] - // //Set the state of the resource - // state.counter[rid] = lsb(value); //limit #waves - // state.flag[rid] = 0; - // return rd_done; //release calling wave - void - Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_V class methods --- - - Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_v") - { - } // Inst_DS__DS_GWS_SEMA_V - - Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V() - { - } // ~Inst_DS__DS_GWS_SEMA_V - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // updating the counter and labeling the resource as a semaphore. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // //Incr the state counter of the resource - // state.counter[rid]++; - // state.type = SEMAPHORE; - // return rd_done; //release calling wave - // This action will release one waved if any are queued in this resource. 
- void - Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_BR class methods --- - - Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_br") - { - } // Inst_DS__DS_GWS_SEMA_BR - - Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR() - { - } // ~Inst_DS__DS_GWS_SEMA_BR - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // updating the counter by the bulk release delivered count and labeling - // the resource as a semaphore. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // index = find first valid (vector mask) - // count = DATA[thread: index]; - // //Add count to the resource state counter - // state.counter[rid] += count; - // state.type = SEMAPHORE; - // return rd_done; //release calling wave - // This action will release count number of waves, immediately if queued, - // or as they arrive from the noted resource. - void - Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_SEMA_P class methods --- - - Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_sema_p") - { - } // Inst_DS__DS_GWS_SEMA_P - - Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P() - { - } // ~Inst_DS__DS_GWS_SEMA_P - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // queueing it until counter enables a release and then decrementing the - // counter of the resource as a semaphore. 
- // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; - // state.type = SEMAPHORE; - // ENQUEUE until(state[rid].counter > 0) - // state[rid].counter--; - // return rd_done - void - Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_GWS_BARRIER class methods --- - - Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_gws_barrier") - { - } // Inst_DS__DS_GWS_BARRIER - - Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER() - { - } // ~Inst_DS__DS_GWS_BARRIER - - // --- description from .arch file --- - // GDS Only: The GWS resource indicated will process this opcode by - // queueing it until barrier is satisfied. The number of waves needed is - // passed in as DATA of first valid thread. - // //Determine the GWS resource to work on - // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + OFFSET0[5:0]; - // index = find first valid (vector mask); - // value = DATA[thread: index]; - // // Input Decision Machine - // state.type[rid] = BARRIER; - // if(state[rid].counter <= 0) { - // thread[rid].flag = state[rid].flag; - // ENQUEUE; - // state[rid].flag = !state.flag; - // state[rid].counter = value; - // return rd_done; - // } else { - // state[rid].counter--; - // thread.flag = state[rid].flag; - // ENQUEUE; - // } - // Since the waves deliver the count for the next barrier, this function - // can have a different size barrier for each occurrence. 
- // // Release Machine - // if(state.type == BARRIER) { - // if(state.flag != thread.flag) { - // return rd_done; - // } - // } - void - Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CONSUME class methods --- - - Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_consume") - { - } // Inst_DS__DS_CONSUME - - Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME() - { - } // ~Inst_DS__DS_CONSUME - - // --- description from .arch file --- - // LDS & GDS. Subtract (count_bits(exec_mask)) from the value stored in DS - // memory at (M0.base + instr_offset). Return the pre-operation value to - // VGPRs. - void - Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_APPEND class methods --- - - Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_append") - { - } // Inst_DS__DS_APPEND - - Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND() - { - } // ~Inst_DS__DS_APPEND - - // --- description from .arch file --- - // LDS & GDS. Add (count_bits(exec_mask)) to the value stored in DS memory - // at (M0.base + instr_offset). Return the pre-operation value to VGPRs. - void - Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ORDERED_COUNT class methods --- - - Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_ordered_count") - { - } // Inst_DS__DS_ORDERED_COUNT - - Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT() - { - } // ~Inst_DS__DS_ORDERED_COUNT - - // --- description from .arch file --- - // GDS-only. Add (count_bits(exec_mask)) to one of 4 dedicated - // ordered-count counters (aka 'packers'). Additional bits of instr.offset - // field are overloaded to hold packer-id, 'last'. 
- void - Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_SRC2_U64 class methods --- - - Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_src2_u64") - { - } // Inst_DS__DS_ADD_SRC2_U64 - - Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64() - { - } // ~Inst_DS__DS_ADD_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] + MEM[B]. - void - Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_SRC2_U64 class methods --- - - Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_src2_u64") - { - } // Inst_DS__DS_SUB_SRC2_U64 - - Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64() - { - } // ~Inst_DS__DS_SUB_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] - MEM[B]. - void - Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_SRC2_U64 class methods --- - - Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_src2_u64") - { - } // Inst_DS__DS_RSUB_SRC2_U64 - - Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64() - { - } // ~Inst_DS__DS_RSUB_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B] - MEM[A]. 
- void - Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_SRC2_U64 class methods --- - - Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_src2_u64") - { - } // Inst_DS__DS_INC_SRC2_U64 - - Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64() - { - } // ~Inst_DS__DS_INC_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). - void - Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_SRC2_U64 class methods --- - - Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_src2_u64") - { - } // Inst_DS__DS_DEC_SRC2_U64 - - Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64() - { - } // ~Inst_DS__DS_DEC_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). - // Uint decrement. - void - Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_I64 class methods --- - - Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_i64") - { - } // Inst_DS__DS_MIN_SRC2_I64 - - Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64() - { - } // ~Inst_DS__DS_MIN_SRC2_I64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). 
- void - Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_I64 class methods --- - - Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_i64") - { - } // Inst_DS__DS_MAX_SRC2_I64 - - Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64() - { - } // ~Inst_DS__DS_MAX_SRC2_I64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). - void - Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_U64 class methods --- - - Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_u64") - { - } // Inst_DS__DS_MIN_SRC2_U64 - - Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64() - { - } // ~Inst_DS__DS_MIN_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). - void - Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_U64 class methods --- - - Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_u64") - { - } // Inst_DS__DS_MAX_SRC2_U64 - - Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64() - { - } // ~Inst_DS__DS_MAX_SRC2_U64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). 
- void - Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_SRC2_B64 class methods --- - - Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_src2_b64") - { - } // Inst_DS__DS_AND_SRC2_B64 - - Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() - { - } // ~Inst_DS__DS_AND_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] & MEM[B]. - void - Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_SRC2_B64 class methods --- - - Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_src2_b64") - { - } // Inst_DS__DS_OR_SRC2_B64 - - Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() - { - } // ~Inst_DS__DS_OR_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] | MEM[B]. - void - Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_SRC2_B64 class methods --- - - Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_src2_b64") - { - } // Inst_DS__DS_XOR_SRC2_B64 - - Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() - { - } // ~Inst_DS__DS_XOR_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] ^ MEM[B]. 
- void - Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_SRC2_B64 class methods --- - - Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_src2_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_SRC2_B64 - - Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() - { - } // ~Inst_DS__DS_WRITE_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B]. - // Write qword. - void - Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_F64 class methods --- - - Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_SRC2_F64 - - Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() - { - } // ~Inst_DS__DS_MIN_SRC2_F64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_F64 class methods --- - - Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_SRC2_F64 - - Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() - { - } // ~Inst_DS__DS_MAX_SRC2_F64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. 
- // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_B96 class methods --- - - Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b96") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B96 - - Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() - { - } // ~Inst_DS__DS_WRITE_B96 - - // --- description from .arch file --- - // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0]. - // Tri-dword write. - void - Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<3>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B128 class methods 
--- - - Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b128") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B128 - - Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() - { - } // ~Inst_DS__DS_WRITE_B128 - - // --- description from .arch file --- - // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. - // Qword write. - void - Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_READ_B96 class methods --- - - 
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b96") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B96 - - Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() - { - } // ~Inst_DS__DS_READ_B96 - - // --- description from .arch file --- - // Tri-dword read. - void - Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<3>(gpuDynInst, offset); - } - - void - Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } - // --- Inst_DS__DS_READ_B128 class methods --- - - Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b128") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B128 - - Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() - { - } // ~Inst_DS__DS_READ_B128 - - // --- description from .arch file --- - // Qword read. 
- void - Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_X class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_X - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - // --- description from .arch file --- - // Untyped buffer load 1 
dword with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XY class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY - - // --- description from .arch file --- - // Untyped buffer load 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ - - // --- description from .arch file --- - // Untyped buffer load 3 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW - - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW - - // --- description from .arch file --- - // Untyped buffer load 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_X class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_X - ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_X - - Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X - - // --- description from .arch file --- - // Untyped buffer store 1 dword with format conversion. 
- void - Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XY class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_XY - ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY - - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY - - // --- description from .arch file --- - // Untyped buffer store 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ - ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ - - // --- description from .arch file --- - // Untyped buffer store 3 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - - // --- description from .arch file --- - // Untyped buffer store 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - - // --- description from .arch file --- - // Untyped buffer load 1 dword with format conversion. 
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - // --- description from .arch file --- - // Untyped buffer load 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Untyped buffer load 3 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Untyped buffer load 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_x") - { - setFlag(Store); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - // --- description from .arch file --- - // Untyped buffer store 1 dword with format conversion. 
- void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - // --- description from .arch file --- - // Untyped buffer store 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Untyped buffer store 3 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Untyped buffer store 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_UBYTE class methods --- - - Inst_MUBUF__BUFFER_LOAD_UBYTE - ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_ubyte") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_UBYTE - - Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE() - { - } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE - - // --- description from .arch file --- - // Untyped buffer load unsigned byte (zero extend to VGPR destination). 
- void - Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // execute - - // --- Inst_MUBUF__BUFFER_LOAD_SBYTE class methods --- - - Inst_MUBUF__BUFFER_LOAD_SBYTE - ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, 
"buffer_load_sbyte") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_SBYTE - - Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() - { - } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE - - // --- description from .arch file --- - // Untyped buffer load signed byte (sign extend to VGPR destination). - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_USHORT class methods --- - - Inst_MUBUF__BUFFER_LOAD_USHORT - ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_ushort") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_USHORT - - Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT() - { - } // ~Inst_MUBUF__BUFFER_LOAD_USHORT - - // --- description from .arch file --- - // Untyped buffer load unsigned short (zero extend to VGPR destination). 
- void - Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // execute - - // --- Inst_MUBUF__BUFFER_LOAD_SSHORT class methods --- - - Inst_MUBUF__BUFFER_LOAD_SSHORT - ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, 
"buffer_load_sshort") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_SSHORT - - Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() - { - } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT - - // --- description from .arch file --- - // Untyped buffer load signed short (sign extend to VGPR destination). - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORD - ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORD - - Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORD - - // --- description from .arch file --- - // Untyped buffer load dword. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX2 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX2 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, 
"buffer_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer load 2 dwords. - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 
1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX3 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX3 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer load 3 dwords. - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, 
rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX4 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX4 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer load 4 dwords. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - 
vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - vdst3[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_BYTE class methods --- - - Inst_MUBUF__BUFFER_STORE_BYTE - ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_byte") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_BYTE - - Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() - { - } // ~Inst_MUBUF__BUFFER_STORE_BYTE - - // --- description from .arch file --- - // Untyped buffer store byte. - void - Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI8 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - 
calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_SHORT class methods --- - - Inst_MUBUF__BUFFER_STORE_SHORT - ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_short") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_SHORT - - Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT() - { - } // ~Inst_MUBUF__BUFFER_STORE_SHORT - - // --- description from .arch file --- - // Untyped buffer store short. 
- void - Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI16 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_DWORD class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORD:: - Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dword") - { - 
setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORD - - Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORD - - // --- description from .arch file --- - // Untyped buffer store dword. - void - Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - 
initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX2 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX2 - ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX2 - - Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer store 2 dwords. - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - 
addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX3 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX3 - ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX3 - - Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer store 3 dwords. 
- void - Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - 
void - Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX4 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX4 - ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX4 - - Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer store 4 dwords. - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - 
addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 3] - = data3[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_LDS_DWORD class methods --- - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD - ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_lds_dword") - { - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - // --- description from .arch file --- - // Store one DWORD from LDS memory to system memory without utilizing - // VGPRs. 
- void - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_WBINVL1 class methods --- - - Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1") - { - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1 - - Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1 - - // --- description from .arch file --- - // Write back and invalidate the shader L1. - // Always returns ACK to shader. - void - Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - - void - Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst) - { - // TODO: Fix it for gfx10. Once we have the new gfx10 cache model, we - // need to precisely communicate the writeback-invalidate operation to - // the new gfx10 coalescer rather than sending AcquireRelease markers. - // The SICoalescer would need to be updated appropriately as well. 
- injectGlobalMemFence(gpuDynInst); - } // initiateAcc - void - Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_WBINVL1_VOL class methods --- - - Inst_MUBUF__BUFFER_WBINVL1_VOL - ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") { - // This instruction is same as buffer_wbinvl1 instruction except this - // instruction only invalidate L1 shader line with MTYPE SC and GC. - // Since Hermes L1 (TCP) do not differentiate between its cache lines, - // this instruction currently behaves (and implemented ) exactly like - // buffer_wbinvl1 instruction. - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1_VOL - - Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL - - // --- description from .arch file --- - // Write back and invalidate the shader L1 only for lines that are marked - // --- volatile. - // Always returns ACK to shader. - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst) - { - injectGlobalMemFence(gpuDynInst); - } // initiateAcc - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SWAP - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP - - Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 src(gpuDynInst, extData.VDATA); - ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1); - - rsrcDesc.read(); - offset.read(); - src.read(); - cmp.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->x_data))[lane] - = src[lane]; - (reinterpret_cast(gpuDynInst->a_data))[lane] - = cmp[lane]; - } - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < 
NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_ADD - ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD - - Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SUB class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SUB - ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB - - Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMIN - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN - - Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMIN - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN - - Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMAX - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX - - Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMAX - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX - - Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_AND class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_AND - ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND - - Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_OR class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_OR - ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_OR - - Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_XOR class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_XOR - ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR - - Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_INC class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_INC - ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC - - Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_DEC class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_DEC - ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC - - Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_AND_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_OR_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_INC_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - // --- description from .arch file --- - // Typed buffer load 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - // --- description from .arch file --- - // Typed buffer load 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - // --- description from .arch file --- - // Typed buffer load 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - // --- description from .arch file --- - // Typed buffer load 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - // --- description from .arch file --- - // Typed buffer store 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - // --- description from .arch file --- - // Typed buffer store 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - // --- description from .arch file --- - // Typed buffer store 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - // --- description from .arch file --- - // Typed buffer store 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X:: - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - // --- description from .arch file --- - // Typed buffer load 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - // --- description from .arch file --- - // Typed buffer load 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Typed buffer load 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Typed buffer load 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - // --- description from .arch file --- - // Typed buffer store 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - // --- description from .arch file --- - // Typed buffer store 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Typed buffer store 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Typed buffer store 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute( - GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD class methods --- - - Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD - - Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD() - { - } // ~Inst_MIMG__IMAGE_LOAD - - // --- description from .arch file --- - // Image memory load with format conversion specified in T#. No sampler. - void - Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_MIP class methods --- - - Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_mip") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_MIP - - Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP() - { - } // ~Inst_MIMG__IMAGE_LOAD_MIP - - // --- description from .arch file --- - // Image memory load with user-supplied mip level. No sampler. 
- void - Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_PCK class methods --- - - Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_pck") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_PCK - - Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK() - { - } // ~Inst_MIMG__IMAGE_LOAD_PCK - - // --- description from .arch file --- - // Image memory load with no format conversion. No sampler. - void - Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_PCK_SGN class methods --- - - Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_pck_sgn") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_PCK_SGN - - Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN() - { - } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN - - // --- description from .arch file --- - // Image memory load with with no format conversion and sign extension. No - // --- sampler. 
- void - Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK class methods --- - - Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_mip_pck") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_MIP_PCK - - Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK() - { - } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK - - // --- description from .arch file --- - // Image memory load with user-supplied mip level, no format conversion. No - // --- sampler. - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN class methods --- - - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_load_mip_pck_sgn") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN - - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN() - { - } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN - - // --- description from .arch file --- - // Image memory load with user-supplied mip level, no format conversion and - // --- with sign extension. No sampler. 
- void - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE class methods --- - - Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE - - Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE() - { - } // ~Inst_MIMG__IMAGE_STORE - - // --- description from .arch file --- - // Image memory store with format conversion specified in T#. No sampler. - void - Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE_MIP class methods --- - - Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store_mip") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE_MIP - - Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP() - { - } // ~Inst_MIMG__IMAGE_STORE_MIP - - // --- description from .arch file --- - // Image memory store with format conversion specified in T# to user - // specified mip level. No sampler. 
- void - Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE_PCK class methods --- - - Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store_pck") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE_PCK - - Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK() - { - } // ~Inst_MIMG__IMAGE_STORE_PCK - - // --- description from .arch file --- - // Image memory store of packed data without format conversion. No sampler. - void - Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_STORE_MIP_PCK class methods --- - - Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_store_mip_pck") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_STORE_MIP_PCK - - Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK() - { - } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK - - // --- description from .arch file --- - // Image memory store of packed data without format conversion to - // user-supplied mip level. No sampler. 
- void - Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MIMG__IMAGE_GET_RESINFO class methods --- - - Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_get_resinfo") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GET_RESINFO - - Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO() - { - } // ~Inst_MIMG__IMAGE_GET_RESINFO - - // --- description from .arch file --- - // return resource info for a given mip level specified in the address - // vgpr. No sampler. Returns 4 integer values into VGPRs 3-0: - // {num_mip_levels, depth, height, width}. - void - Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SWAP class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SWAP - - Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. 
- void - Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_CMPSWAP class methods --- - - Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP - - Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_ADD class methods --- - - Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_ADD - - Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. 
- void - Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SUB class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SUB - - Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SMIN class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SMIN - - Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_UMIN class methods --- - - Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_UMIN - - Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_SMAX class methods --- - - Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_SMAX - - Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_UMAX class methods --- - - Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_UMAX - - Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_AND class methods --- - - Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_AND - - Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_OR class methods --- - - Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_OR - - Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_XOR class methods --- - - Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_XOR - - Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_INC class methods --- - - Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_INC - - Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_ATOMIC_DEC class methods --- - - Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_ATOMIC_DEC - - Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC() - { - } // ~Inst_MIMG__IMAGE_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE class methods --- - - Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample") - { - } // Inst_MIMG__IMAGE_SAMPLE - - Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE() - { - } // ~Inst_MIMG__IMAGE_SAMPLE - - // --- description from .arch file --- - // sample texture map. 
- void - Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CL - - Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader. - void - Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D - - Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D - - // --- description from .arch file --- - // sample texture map, with user derivatives - void - Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D_CL - - Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader, with user - // --- derivatives. 
- void - Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_L class methods --- - - Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_L - - Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_L - - // --- description from .arch file --- - // sample texture map, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B - - Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B - - // --- description from .arch file --- - // sample texture map, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B_CL - - Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader, with lod bias. 
- void - Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_LZ class methods --- - - Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_LZ - - Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_LZ - - // --- description from .arch file --- - // sample texture map, from level 0. - void - Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C - - Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C - - // --- description from .arch file --- - // sample texture map, with PCF. - void - Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CL - - Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D - - Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D - - // --- description from .arch file --- - // SAMPLE_C, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_L class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_L - - Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_L - - // --- description from .arch file --- - // SAMPLE_C, with user LOD. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B - - Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B - - // --- description from .arch file --- - // SAMPLE_C, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_LZ - - Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ - - // --- description from .arch file --- - // SAMPLE_C, from level 0. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_O - - Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_O - - // --- description from .arch file --- - // sample texture map, with user offsets. - void - Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CL_O - - Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O - - // --- description from .arch file --- - // SAMPLE_O with LOD clamp specified in shader. - void - Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D_O - - Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D_O - - // --- description from .arch file --- - // SAMPLE_O, with user derivatives. 
- void - Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_D_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_d_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O - - Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O - - // --- description from .arch file --- - // SAMPLE_O, with LOD clamp specified in shader, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_L_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_L_O - - Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_L_O - - // --- description from .arch file --- - // SAMPLE_O, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B_O - - Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B_O - - // --- description from .arch file --- - // SAMPLE_O, with lod bias. 
- void - Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O - - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O - - // --- description from .arch file --- - // SAMPLE_O, with LOD clamp specified in shader, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_LZ_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_LZ_O - - Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O - - // --- description from .arch file --- - // SAMPLE_O, from level 0. - void - Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_O - - Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_O - - // --- description from .arch file --- - // SAMPLE_C with user specified offsets. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader. - void - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_O - - Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_L_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_L_O - - Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_O - - Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_O - - // --- description from .arch file --- - // SAMPLE_C_O, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with lod bias. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O - - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O - - // --- description from .arch file --- - // SAMPLE_C_O, from level 0. - void - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4 class methods --- - - Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4 - - Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4() - { - } // ~Inst_MIMG__IMAGE_GATHER4 - - // --- description from .arch file --- - // gather 4 single component elements (2x2). - void - Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_CL - - Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD clamp. 
- void - Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_L class methods --- - - Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_L - - Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L() - { - } // ~Inst_MIMG__IMAGE_GATHER4_L - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD. - void - Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B class methods --- - - Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B - - Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias. - void - Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_CL - - Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias and clamp. 
- void - Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_LZ class methods --- - - Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_LZ - - Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ() - { - } // ~Inst_MIMG__IMAGE_GATHER4_LZ - - // --- description from .arch file --- - // gather 4 single component elements (2x2) at level 0. - void - Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C class methods --- - - Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C - - Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with PCF. - void - Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_CL - - Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD clamp and PCF. 
- void - Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_L class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_L - - Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_L - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user LOD and PCF. - void - Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B - - Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias and PCF. - void - Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_CL - - Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL - - // --- description from .arch file --- - // gather 4 single component elements (2x2) with user bias, clamp and PCF. 
- void - Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_LZ class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_LZ - - Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ - - // --- description from .arch file --- - // gather 4 single component elements (2x2) at level 0, with PCF. - void - Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_O - - Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_O - - // --- description from .arch file --- - // GATHER4, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_CL_O - - Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_CL_O - - // --- description from .arch file --- - // GATHER4_CL, with user offsets. 
- void - Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_L_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_L_O - - Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_L_O - - // --- description from .arch file --- - // GATHER4_L, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_O - - Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_O - - // --- description from .arch file --- - // GATHER4_B, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_B_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_CL_O - - Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O - - // --- description from .arch file --- - // GATHER4_B_CL, with user offsets. 
- void - Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_LZ_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_LZ_O - - Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_LZ_O - - // --- description from .arch file --- - // GATHER4_LZ, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_O - - Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_O - - // --- description from .arch file --- - // GATHER4_C, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_CL_O - - Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O - - // --- description from .arch file --- - // GATHER4_C_CL, with user offsets. 
- void - Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_L_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_L_O - - Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_L_O - - // --- description from .arch file --- - // GATHER4_C_L, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_O - - Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_O - - // --- description from .arch file --- - // GATHER4_B, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O - - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O - - // --- description from .arch file --- - // GATHER4_B_CL, with user offsets. 
- void - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GATHER4_C_LZ_O class methods --- - - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O - - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O - - // --- description from .arch file --- - // GATHER4_C_LZ, with user offsets. - void - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_GET_LOD class methods --- - - Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_get_lod") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GET_LOD - - Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() - { - } // ~Inst_MIMG__IMAGE_GET_LOD - - // --- description from .arch file --- - // Return calculated LOD. Vdata gets 2 32bit integer values: { rawLOD, - // --- clampedLOD }. 
- void - Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD - - Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD - - // --- description from .arch file --- - // sample texture map, with user derivatives (LOD per quad) - void - Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_CL - - Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL - - // --- description from .arch file --- - // sample texture map, with LOD clamp specified in shader, with user - // --- derivatives (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD - - Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD - - // --- description from .arch file --- - // SAMPLE_C, with user derivatives (LOD per quad). 
- void - Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL - - // --- description from .arch file --- - // SAMPLE_C, with LOD clamp specified in shader, with user derivatives - // (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_O - - Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_O - - // --- description from .arch file --- - // SAMPLE_O, with user derivatives (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O - - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O - - // --- description from .arch file --- - // SAMPLE_O, with LOD clamp specified in shader, with user derivatives - // (LOD per quad). 
- void - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O - - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user derivatives (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives - // (LOD per quad). - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_EXP__EXP class methods --- - - Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt) - : Inst_EXP(iFmt, "exp") - { - } // Inst_EXP__EXP - - Inst_EXP__EXP::~Inst_EXP__EXP() - { - } // ~Inst_EXP__EXP - - // --- description from .arch file --- - // Export through SX. 
- void - Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_LOAD_UBYTE class methods --- - - Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_ubyte") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_UBYTE - - Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() - { - } // ~Inst_FLAT__FLAT_LOAD_UBYTE - - // --- description from .arch file --- - // Untyped buffer load unsigned byte (zero extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - vdst.write(); - } // execute - // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods --- - - Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_sbyte") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_SBYTE - - Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() - { - } // ~Inst_FLAT__FLAT_LOAD_SBYTE - - // --- description from .arch file --- - // Untyped buffer load 
signed byte (sign extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_FLAT__FLAT_LOAD_USHORT class methods --- - - Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_ushort") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_USHORT - - Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() - { - } // ~Inst_FLAT__FLAT_LOAD_USHORT - - // --- description from .arch file --- - // Untyped buffer load unsigned short (zero extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - vdst.write(); - } // execute - - // --- Inst_FLAT__FLAT_LOAD_SSHORT class methods --- - - Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt) - : 
Inst_FLAT(iFmt, "flat_load_sshort") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_SSHORT - - Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() - { - } // ~Inst_FLAT__FLAT_LOAD_SSHORT - - // --- description from .arch file --- - // Untyped buffer load signed short (sign extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_FLAT__FLAT_LOAD_DWORD class methods --- - - Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORD - - Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORD - - // --- description from .arch file --- - // Untyped buffer load dword. 
- void - Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX2 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX2 - - Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer load 2 dwords. 
- void - Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX3 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX3 - - Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer load 3 dwords. 
- void - Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX4 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX4 - - Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer load 4 dwords. 
- void - Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_BYTE class methods --- - - Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_byte") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_BYTE - - Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE() - { - } // ~Inst_FLAT__FLAT_STORE_BYTE - - // --- description from .arch file --- - // Untyped buffer store byte. 
- void - Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU8 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_FLAT__FLAT_STORE_SHORT class methods --- - - Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_short") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_SHORT - - Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT() - { - } // ~Inst_FLAT__FLAT_STORE_SHORT - - // --- description from .arch file --- - // Untyped buffer store short. 
- void - Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU16 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods --- - - Inst_FLAT__FLAT_STORE_SHORT_D16_HI:: - Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_short_d16_hi") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_SHORT_D16_HI - - Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI() - { - } // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI - - // --- description from .arch file --- - // Untyped buffer store short. 
- void - Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = (data[lane] >> 16); - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORD class methods --- - - Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORD - - Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() - { - } // ~Inst_FLAT__FLAT_STORE_DWORD - - // --- description from .arch file --- - // Untyped buffer store dword. 
- void - Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX2 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX2 - - Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer store 2 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX3 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX3 - - Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer store 3 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX4 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX4 - - Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer store 4 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); - - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SWAP class methods --- - - Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP - - Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP - - // --- description from .arch file --- - // 
32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - - // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP - ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD - - Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SUB class methods --- - - Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB - - Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMIN class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN - - Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMIN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMIN::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMIN class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN - - Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMIN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMIN::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMAX class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX - - Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMAX::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMAX::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMAX class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX - - Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMAX::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMAX::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_AND class methods --- - - Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND - - Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_AND::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_AND::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_OR class methods --- - - Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR - - Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_OR::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_OR::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - - // --- Inst_FLAT__FLAT_ATOMIC_XOR class methods --- - - Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR - - Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_XOR::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_XOR::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_INC class methods --- - - Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC - - Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_DEC class methods --- - - Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC - - Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SWAP_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SUB_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMIN_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMIN_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SMAX_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMAX_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_AND_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND_X2 - - Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_AND_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_AND_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_OR_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR_X2 - - Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_OR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_OR_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_XOR_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_INC_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC_X2 - - Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 
0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_DEC_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD_F32 class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD_F32::Inst_FLAT__FLAT_ATOMIC_ADD_F32( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_f32") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_F32 - - Inst_FLAT__FLAT_ATOMIC_ADD_F32::~Inst_FLAT__FLAT_ATOMIC_ADD_F32() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F32 - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 class methods --- - - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_pk_add_f16") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 - - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16() - { - } // ~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 - - void - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - 
void - Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD_F64 class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD_F64::Inst_FLAT__FLAT_ATOMIC_ADD_F64( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_f64") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_F64 - - Inst_FLAT__FLAT_ATOMIC_ADD_F64::~Inst_FLAT__FLAT_ATOMIC_ADD_F64() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F64 - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F64::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_F64::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_MIN_F64 class methods --- - - Inst_FLAT__FLAT_ATOMIC_MIN_F64::Inst_FLAT__FLAT_ATOMIC_MIN_F64( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_min_f64") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_MIN_F64 - - Inst_FLAT__FLAT_ATOMIC_MIN_F64::~Inst_FLAT__FLAT_ATOMIC_MIN_F64() - { - } // ~Inst_FLAT__FLAT_ATOMIC_MIN_F64 - - void - Inst_FLAT__FLAT_ATOMIC_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_MIN_F64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_MIN_F64::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_MAX_F64 class methods --- - - Inst_FLAT__FLAT_ATOMIC_MAX_F64::Inst_FLAT__FLAT_ATOMIC_MAX_F64( - InFmt_FLAT *iFmt) - : 
Inst_FLAT(iFmt, "flat_atomic_max_f64") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_MAX_F64 - - Inst_FLAT__FLAT_ATOMIC_MAX_F64::~Inst_FLAT__FLAT_ATOMIC_MAX_F64() - { - } // ~Inst_FLAT__FLAT_ATOMIC_MAX_F64 - - void - Inst_FLAT__FLAT_ATOMIC_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - atomicExecute(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_MAX_F64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_MAX_F64::completeAcc(GPUDynInstPtr gpuDynInst) - { - atomicComplete(gpuDynInst); - } // completeAcc - // --- Inst_VOP3P__V_PK_FMA_F32 class methods --- - - Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_fma_f32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_FMA_F32 - - Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32() - { - } // ~Inst_VOP3P__V_PK_FMA_F32 - - // D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] = - // S0.f[31:0] * S1.f[31:0] + S2.f[31:0] . - void - Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. U64 is used here as float - // values cannot use bitwise operations. Consider the U64 to imply - // untyped 64-bits of data. - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - int opsel = instData.OPSEL; - int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - uint32_t s0l = (opsel & 1) ? 
bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32) - : bits(src2[lane], 31, 0); - - float dword1 = std::fma(*reinterpret_cast(&s0l), - *reinterpret_cast(&s1l), - *reinterpret_cast(&s2l)); - - uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32) - : bits(src2[lane], 31, 0); - - float dword2 = std::fma(*reinterpret_cast(&s0h), - *reinterpret_cast(&s1h), - *reinterpret_cast(&s2h)); - - uint32_t result1 = *reinterpret_cast(&dword1); - uint32_t result2 = *reinterpret_cast(&dword2); - - vdst[lane] = (static_cast(result2) << 32) | result1; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P__V_PK_MUL_F32 class methods --- - - Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_mul_f32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_MUL_F32 - - Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32() - { - } // ~Inst_VOP3P__V_PK_MUL_F32 - - // D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] * - // S1.f[31:0] - void - Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. U64 is used here as float - // values cannot use bitwise operations. Consider the U64 to imply - // untyped 64-bits of data. 
- Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - int opsel = instData.OPSEL; - int opsel_hi = extData.OPSEL_HI; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword1 = *reinterpret_cast(&lower_dword) - * *reinterpret_cast(&upper_dword); - - lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword2 = *reinterpret_cast(&lower_dword) - * *reinterpret_cast(&upper_dword); - - uint32_t result1 = *reinterpret_cast(&dword1); - uint32_t result2 = *reinterpret_cast(&dword2); - - vdst[lane] = (static_cast(result2) << 32) | result1; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P__V_PK_ADD_F32 class methods --- - - Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_add_f32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_ADD_F32 - - Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32() - { - } // ~Inst_VOP3P__V_PK_ADD_F32 - - // D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] + - // S1.f[31:0] - void - Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. U64 is used here as float - // values cannot use bitwise operations. Consider the U64 to imply - // untyped 64-bits of data. 
- Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - int opsel = instData.OPSEL; - int opsel_hi = extData.OPSEL_HI; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword1 = *reinterpret_cast(&lower_dword) - + *reinterpret_cast(&upper_dword); - - lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - float dword2 = *reinterpret_cast(&lower_dword) - + *reinterpret_cast(&upper_dword); - - uint32_t result1 = *reinterpret_cast(&dword1); - uint32_t result2 = *reinterpret_cast(&dword2); - - vdst[lane] = (static_cast(result2) << 32) | result1; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P__V_PK_MOV_B32 class methods --- - - Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt) - : Inst_VOP3P(iFmt, "v_pk_mov_b32") - { - setFlag(ALU); - } // Inst_VOP3P__V_PK_MOV_B32 - - Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32() - { - } // ~Inst_VOP3P__V_PK_MOV_B32 - - // D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0]. - void - Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - // This is a special case of packed instructions which operates on - // 64-bit inputs/outputs and not 32-bit. 
- Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - VecOperandU64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - // Only OPSEL[1:0] are used - // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0 - - int opsel = instData.OPSEL; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - // OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1 - uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) - : bits(src0[lane], 31, 0); - uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) - : bits(src1[lane], 31, 0); - - vdst[lane] = upper_dword << 32 | lower_dword; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 class methods --- - - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8(InFmt_VOP3P_MAI *iFmt) - : Inst_VOP3P_MAI(iFmt, "v_mfma_i32_16x16x16i8") - { - setFlag(ALU); - } // Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 - - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: - ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8() - { - } // ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 - - // D(16x16I32) = A(16x16I8) x B(16x16I8) + C(16x16I32), 1 Blocks, 8 - // pass, srcA/srcB 1 archVgpr, srcC/D 4 accVGPR - void - Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst) - { - int acc_offset = 0; - if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); - } - - // int8 size allows for 4 elements per lane. At 16x16 this means 4 - // lanes per column (A matrix) / (B matrix). This whole matrix fits - // in one VGPR. The C matrix with size int32 requires 4 VGPRs. - // Handle the C matrix by using a delta. This is set to 1 normally to - // move to the next VGPR (1 dword away) and 0 if the input is a scalar - // reg (e.g., a constant). - int delta = isVectorReg(extData.SRC2) ? 
1 : 0; - - // VecOperandI8 will read 8 bits and sign extend, so used U32 to read - // as "untyped" 32-bit values. - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); - - VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1); - VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3); - - src0.readSrc(); - src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); - - int32_t A[16][16]; - for (int i = 0; i < 64; ++i) { - // src0[0:15] contains columns 1 - 4 packed for rows 0 - 15, - // src0[16:31] contains columns 5 - 8 packed for rows 0 - 15, - // src0[32:47] contains columns 9 - 12 packed for rows 0 - 15, - // src0[48:63] contains columns 13 - 16 packed for rows 0 - 15, - int row = i % 16; - int start_col = (i / 16) * 4; - - A[row][start_col+0] = sext<8>(bits(src0[i], 7, 0)); - A[row][start_col+1] = sext<8>(bits(src0[i], 15, 8)); - A[row][start_col+2] = sext<8>(bits(src0[i], 23, 16)); - A[row][start_col+3] = sext<8>(bits(src0[i], 31, 24)); - } - - int32_t B[16][16]; - for (int i = 0; i < 64; ++i) { - // src1[0:15] contains rows 1 - 4 packed for columns 0 - 15 - // src1[16:31] contains rows 5 - 8 packed for columns 0 - 15 - // src1[32:47] contains rows 9 - 12 packed for columns 0 - 15 - // src1[48:63] contains rows 13 - 16 packed for columns 0 - 15 - int start_row = (i / 16) * 4; - int col = i % 16; - - B[start_row+0][col] = sext<8>(bits(src1[i], 7, 0)); - B[start_row+1][col] = sext<8>(bits(src1[i], 15, 8)); - B[start_row+2][col] = sext<8>(bits(src1[i], 23, 16)); - 
B[start_row+3][col] = sext<8>(bits(src1[i], 31, 24)); - } - - int32_t result[16][16]; - - // Load accumulation matrix C into result - for (int i = 0; i < 64; ++i) { - // src2a contains rows 0, 4, 8, 12 - result[(i/16)*4][(i%16)] = src2a[i]; - // src2b contains rows 1, 5, 9, 13 - result[(i/16)*4+1][(i%16)] = src2b[i]; - // src2c contains rows 2, 6, 10, 14 - result[(i/16)*4+2][(i%16)] = src2c[i]; - // src2d contains rows 3, 7, 11, 15 - result[(i/16)*4+3][(i%16)] = src2d[i]; - } - - // Compute new result - This is (obviously) not optimized - for (int i = 0; i < 16; ++i) { - for (int j = 0; j < 16; ++j) { - for (int k = 0; k < 16; ++k) { - result[i][j] += A[i][k] * B[k][j]; - } - } - } - - // Put result in dest VGPRs - for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0, 4, 8, 12 - vdsta[i] = result[(i/16)*4][(i%16)]; - // vdstb contains rows 1, 5, 9, 13 - vdstb[i] = result[(i/16)*4+1][(i%16)]; - // vdstc contains rows 2, 6, 10, 14 - vdstc[i] = result[(i/16)*4+2][(i%16)]; - // vdstd contains rows 3, 7, 11, 15 - vdstd[i] = result[(i/16)*4+3][(i%16)]; - } - - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); - } // execute - // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods --- - - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI *iFmt) - : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64") - { - setFlag(ALU); - } // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 - - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: - ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64() - { - } // ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 - - // D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8 - // pass, srcA/srcB 2 VGPR, srcC/D 8 VGPR - void - Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst) - { - int acc_offset = 0; - if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); - } - - // Handling of src2 is a bit tricky. 
The operator[] overload cannot - // be used for dword count > 2, and the dword count here is 8. Usually - // src2 is a VGPR/AccGPR, but it might also be constant. In order to - // use operator[] and handle constants, check for VGPR here and set - // a delta for each of the pairs of src2 GPRs. - int delta = isVectorReg(extData.SRC2) ? 2 : 0; - - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); - - VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4); - VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6); - - src0.readSrc(); - src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); - - double result[16][16]; - - // Load src2 into result. 
src2 is row major - for (int i = 0; i < 64; ++i) { - // src2a contains rows 0 - 3 - result[(i/16)][(i%16)] = src2a[i]; - // src2b contains rows 4 - 7 - result[(i/16)+4][(i%16)] = src2b[i]; - // src2c contains rows 8 - 11 - result[(i/16)+8][(i%16)] = src2c[i]; - // src2d contains rows 12 - 15 - result[(i/16)+12][(i%16)] = src2d[i]; - } - - // Compute new result - for (int i = 0; i < 16; ++i) { - for (int j = 0; j < 16; ++j) { - for (int k = 0; k < 4; ++k) { - // src0 is column major, src1 is row major - int lane_A = 16*k + i; - int lane_B = 16*k + j; - result[i][j] += src0[lane_A] * src1[lane_B]; - } - } - } - - // Put result in dest VGPRs - for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0 - 3 - vdsta[i] = result[(i/16)][(i%16)]; - // src2b contains rows 4 - 7 - vdstb[i] = result[(i/16)+4][(i%16)]; - // src2c contains rows 8 - 11 - vdstc[i] = result[(i/16)+8][(i%16)]; - // src2d contains rows 12 - 15 - vdstd[i] = result[(i/16)+12][(i%16)]; - } - - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); - } // execute -} // namespace VegaISA -} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/mimg.cc b/src/arch/amdgpu/vega/insts/mimg.cc new file mode 100644 index 0000000000..29a37cca1d --- /dev/null +++ b/src/arch/amdgpu/vega/insts/mimg.cc @@ -0,0 +1,2047 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_MIMG__IMAGE_LOAD class methods --- + + Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD + + Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD() + { + } // ~Inst_MIMG__IMAGE_LOAD + + // --- description from .arch file --- + // Image memory load with format conversion specified in T#. No sampler. 
+ void + Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP + + Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP + + // --- description from .arch file --- + // Image memory load with user-supplied mip level. No sampler. + void + Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_PCK class methods --- + + Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_pck") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_PCK + + Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK() + { + } // ~Inst_MIMG__IMAGE_LOAD_PCK + + // --- description from .arch file --- + // Image memory load with no format conversion. No sampler. 
+ void + Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_PCK_SGN class methods --- + + Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_pck_sgn") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_PCK_SGN + + Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN() + { + } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN + + // --- description from .arch file --- + // Image memory load with with no format conversion and sign extension. No + // --- sampler. + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip_pck") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP_PCK + + Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK + + // --- description from .arch file --- + // Image memory load with user-supplied mip level, no format conversion. No + // --- sampler. 
+ void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip_pck_sgn") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN + + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN + + // --- description from .arch file --- + // Image memory load with user-supplied mip level, no format conversion and + // --- with sign extension. No sampler. + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE class methods --- + + Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE + + Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE() + { + } // ~Inst_MIMG__IMAGE_STORE + + // --- description from .arch file --- + // Image memory store with format conversion specified in T#. No sampler. 
+ void + Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_MIP class methods --- + + Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_mip") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_MIP + + Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP() + { + } // ~Inst_MIMG__IMAGE_STORE_MIP + + // --- description from .arch file --- + // Image memory store with format conversion specified in T# to user + // specified mip level. No sampler. + void + Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_PCK class methods --- + + Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_pck") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_PCK + + Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK() + { + } // ~Inst_MIMG__IMAGE_STORE_PCK + + // --- description from .arch file --- + // Image memory store of packed data without format conversion. No sampler. 
+ void + Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_MIP_PCK class methods --- + + Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_mip_pck") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_MIP_PCK + + Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK() + { + } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK + + // --- description from .arch file --- + // Image memory store of packed data without format conversion to + // user-supplied mip level. No sampler. + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_GET_RESINFO class methods --- + + Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_get_resinfo") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GET_RESINFO + + Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO() + { + } // ~Inst_MIMG__IMAGE_GET_RESINFO + + // --- description from .arch file --- + // return resource info for a given mip level specified in the address + // vgpr. No sampler. Returns 4 integer values into VGPRs 3-0: + // {num_mip_levels, depth, height, width}. 
+ void + Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SWAP class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SWAP + + Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_CMPSWAP class methods --- + + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP + + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_ADD class methods --- + + Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_ADD + + Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SUB class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SUB + + Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SMIN class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_smin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SMIN + + Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_UMIN class methods --- + + Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_umin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_UMIN + + Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SMAX class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_smax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SMAX + + Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_UMAX class methods --- + + Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_umax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_UMAX + + Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_AND class methods --- + + Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_and") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_AND + + Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_AND + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_OR class methods --- + + Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_or") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_OR + + Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_OR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_XOR class methods --- + + Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_xor") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_XOR + + Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_XOR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_INC class methods --- + + Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_inc") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_INC + + Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_INC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_DEC class methods --- + + Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_dec") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_DEC + + Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_DEC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE class methods --- + + Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample") + { + } // Inst_MIMG__IMAGE_SAMPLE + + Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE() + { + } // ~Inst_MIMG__IMAGE_SAMPLE + + // --- description from .arch file --- + // sample texture map. + void + Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CL + + Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader. 
+ void + Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D + + Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D + + // --- description from .arch file --- + // sample texture map, with user derivatives + void + Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D_CL + + Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader, with user + // --- derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_L class methods --- + + Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_L + + Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_L + + // --- description from .arch file --- + // sample texture map, with user LOD. 
+ void + Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B + + Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B + + // --- description from .arch file --- + // sample texture map, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B_CL + + Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_LZ class methods --- + + Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_LZ + + Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_LZ + + // --- description from .arch file --- + // sample texture map, from level 0. 
+ void + Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C + + Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C + + // --- description from .arch file --- + // sample texture map, with PCF. + void + Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CL + + Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader. + void + Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D + + Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D + + // --- description from .arch file --- + // SAMPLE_C, with user derivatives. 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_L class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_L + + Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_L + + // --- description from .arch file --- + // SAMPLE_C, with user LOD. + void + Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B + + Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B + + // --- description from .arch file --- + // SAMPLE_C, with lod bias. 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_LZ + + Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ + + // --- description from .arch file --- + // SAMPLE_C, from level 0. + void + Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_O + + Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_O + + // --- description from .arch file --- + // sample texture map, with user offsets. 
+ void + Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CL_O + + Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O + + // --- description from .arch file --- + // SAMPLE_O with LOD clamp specified in shader. + void + Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D_O + + Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D_O + + // --- description from .arch file --- + // SAMPLE_O, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_D_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_d_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O + + Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O + + // --- description from .arch file --- + // SAMPLE_O, with LOD clamp specified in shader, with user derivatives. 
+ void + Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_L_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_L_O + + Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_L_O + + // --- description from .arch file --- + // SAMPLE_O, with user LOD. + void + Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B_O + + Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B_O + + // --- description from .arch file --- + // SAMPLE_O, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_B_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O + + Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O + + // --- description from .arch file --- + // SAMPLE_O, with LOD clamp specified in shader, with lod bias. 
+ void + Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_LZ_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_LZ_O + + Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O + + // --- description from .arch file --- + // SAMPLE_O, from level 0. + void + Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_O + + Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_O + + // --- description from .arch file --- + // SAMPLE_C with user specified offsets. + void + Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader. 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D_O + + Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O + + // --- description from .arch file --- + // SAMPLE_C_O, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_d_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives. + void + Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_L_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_L_O + + Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O + + // --- description from .arch file --- + // SAMPLE_C_O, with user LOD. 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B_O + + Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_O + + // --- description from .arch file --- + // SAMPLE_C_O, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader, with lod bias. + void + Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O + + Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O + + // --- description from .arch file --- + // SAMPLE_C_O, from level 0. 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4 class methods --- + + Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4 + + Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4() + { + } // ~Inst_MIMG__IMAGE_GATHER4 + + // --- description from .arch file --- + // gather 4 single component elements (2x2). + void + Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_CL + + Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD clamp. + void + Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_L class methods --- + + Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_L + + Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L() + { + } // ~Inst_MIMG__IMAGE_GATHER4_L + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD. 
+ void + Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B class methods --- + + Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B + + Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias. + void + Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B_CL + + Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias and clamp. + void + Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_LZ class methods --- + + Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_LZ + + Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ() + { + } // ~Inst_MIMG__IMAGE_GATHER4_LZ + + // --- description from .arch file --- + // gather 4 single component elements (2x2) at level 0. 
+ void + Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C class methods --- + + Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C + + Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with PCF. + void + Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_CL + + Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD clamp and PCF. + void + Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_L class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_l") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_L + + Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_L + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user LOD and PCF. 
+ void + Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B + + Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias and PCF. + void + Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_CL + + Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL + + // --- description from .arch file --- + // gather 4 single component elements (2x2) with user bias, clamp and PCF. + void + Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_LZ class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_lz") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_LZ + + Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ + + // --- description from .arch file --- + // gather 4 single component elements (2x2) at level 0, with PCF. 
+ void + Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_O + + Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_O + + // --- description from .arch file --- + // GATHER4, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_CL_O + + Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_CL_O + + // --- description from .arch file --- + // GATHER4_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_L_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_L_O + + Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_L_O + + // --- description from .arch file --- + // GATHER4_L, with user offsets. 
+ void + Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B_O + + Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B_O + + // --- description from .arch file --- + // GATHER4_B, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_B_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_B_CL_O + + Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O + + // --- description from .arch file --- + // GATHER4_B_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_LZ_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_LZ_O + + Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_LZ_O + + // --- description from .arch file --- + // GATHER4_LZ, with user offsets. 
+ void + Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_O + + Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_O + + // --- description from .arch file --- + // GATHER4_C, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_CL_O + + Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O + + // --- description from .arch file --- + // GATHER4_C_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_L_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_l_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_L_O + + Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_L_O + + // --- description from .arch file --- + // GATHER4_C_L, with user offsets. 
+ void + Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_O + + Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_O + + // --- description from .arch file --- + // GATHER4_B, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O + + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O + + // --- description from .arch file --- + // GATHER4_B_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_LZ_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O + + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O + + // --- description from .arch file --- + // GATHER4_C_LZ, with user offsets. 
+ void + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GET_LOD class methods --- + + Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_get_lod") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GET_LOD + + Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() + { + } // ~Inst_MIMG__IMAGE_GET_LOD + + // --- description from .arch file --- + // Return calculated LOD. Vdata gets 2 32bit integer values: { rawLOD, + // --- clampedLOD }. + void + Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD + + Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD + + // --- description from .arch file --- + // sample texture map, with user derivatives (LOD per quad) + void + Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_CL + + Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader, with user + // --- derivatives (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD + + Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD + + // --- description from .arch file --- + // SAMPLE_C, with user derivatives (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_O + + Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_O + + // --- description from .arch file --- + // SAMPLE_O, with user derivatives (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O + + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O + + // --- description from .arch file --- + // SAMPLE_O, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O + + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O + + // --- description from .arch file --- + // SAMPLE_C_O, with user derivatives (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/mtbuf.cc b/src/arch/amdgpu/vega/insts/mtbuf.cc new file mode 100644 index 0000000000..2b37dfd6b9 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/mtbuf.cc @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "arch/amdgpu/vega/insts/instructions.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
+
+    // --- description from .arch file ---
+    // Typed buffer load 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
+
+    // --- description from .arch file ---
+    // Typed buffer load 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer load 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer load 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X
+
+    // --- description from .arch file ---
+    // Typed buffer store 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
+
+    // --- description from .arch file ---
+    // Typed buffer store 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer store 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer store 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::
+        ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
+
+    // --- description from .arch file ---
+    // Typed buffer load 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+
+    // --- description from .arch file ---
+    // Typed buffer load 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(
+            InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer load 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(
+            InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer load 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+
+    // --- description from .arch file ---
+    // Typed buffer store 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+
+    // --- description from .arch file ---
+    // Typed buffer store 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer store 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer store 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute(
+        GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+} // namespace VegaISA
+} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/mubuf.cc b/src/arch/amdgpu/vega/insts/mubuf.cc
new file mode 100644
index 0000000000..ff8bae2475
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/mubuf.cc
@@ -0,0 +1,2789 @@
+/*
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_X class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_X + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_x") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X + + Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X + + // --- description from .arch file --- + // Untyped buffer load 1 dword with format conversion. 
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XY class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
+
+    // --- description from .arch file ---
+    // Untyped buffer load 2 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
+
+    // --- description from .arch file ---
+    // Untyped buffer load 3 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
+
+    // --- description from .arch file ---
+    // Untyped buffer load 4 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_X class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_X
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_X
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X
+
+    // --- description from .arch file ---
+    // Untyped buffer store 1 dword with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XY class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XY
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY
+
+    // --- description from .arch file ---
+    // Untyped buffer store 2 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ
+
+    // --- description from .arch file ---
+    // Untyped buffer store 3 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
+        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
+
+    // --- description from .arch file ---
+    // Untyped buffer store 4 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_d16_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
+        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
+
+    // --- description from .arch file ---
+    // Untyped buffer load 1 dword with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
+        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
+
+    // --- description from .arch file ---
+    // Untyped buffer load 2 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
+        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
+
+    // --- description from .arch file ---
+    // Untyped buffer load 3 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW
+        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW
+
+    // --- description from .arch file ---
+    // Untyped buffer load 4 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_d16_x")
+    {
+        // NOTE(review): unlike every sibling BUFFER_STORE_FORMAT_* ctor,
+        // this one omits setFlag(MemoryRef) and setFlag(GlobalSegment).
+        // This commit is a pure code move, so the omission is pre-existing,
+        // but it looks like a copy-paste slip -- confirm against the
+        // original instructions.cc and fix separately if intended.
+        setFlag(Store);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X
+        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X
+
+    // --- description from .arch file ---
+    // Untyped buffer store 1 dword with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
+        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
+
+    // --- description from .arch file ---
+    // Untyped buffer store 2 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
+        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
+
+    // --- description from .arch file ---
+    // Untyped buffer store 3 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
+        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
+
+    // --- description from .arch file ---
+    // Untyped buffer store 4 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_UBYTE class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_UBYTE
+        ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_ubyte")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        // LDS bit set => tag the access as group segment, else global.
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
+    } // Inst_MUBUF__BUFFER_LOAD_UBYTE
+
+    Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE
+
+    // --- description from .arch file ---
+    // Untyped buffer load unsigned byte (zero extend to VGPR destination).
+ void + Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // execute + + // --- Inst_MUBUF__BUFFER_LOAD_SBYTE class methods --- + + Inst_MUBUF__BUFFER_LOAD_SBYTE + ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, 
"buffer_load_sbyte") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_SBYTE + + Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE + + // --- description from .arch file --- + // Untyped buffer load signed byte (sign extend to VGPR destination). + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_USHORT class methods --- + + Inst_MUBUF__BUFFER_LOAD_USHORT + ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_ushort") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_USHORT + + Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT() + { + } // ~Inst_MUBUF__BUFFER_LOAD_USHORT + + // --- description from .arch file --- + // Untyped buffer load unsigned short (zero extend to VGPR destination). 
+ void + Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // execute + + // --- Inst_MUBUF__BUFFER_LOAD_SSHORT class methods --- + + Inst_MUBUF__BUFFER_LOAD_SSHORT + ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, 
"buffer_load_sshort") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_SSHORT + + Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT + + // --- description from .arch file --- + // Untyped buffer load signed short (sign extend to VGPR destination). + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORD + ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORD + + Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORD + + // --- description from .arch file --- + // Untyped buffer load dword. 
+ void + Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX2 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX2 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, 
"buffer_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 + + Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer load 2 dwords. + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<2>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 
1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX3 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX3 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx3") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 + + Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer load 3 dwords. + void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, 
rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<3>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + vdst2[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX4 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX4 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 + + Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer load 4 dwords. 
+ void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + 
vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + vdst2[lane] = 0; + vdst3[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_BYTE class methods --- + + Inst_MUBUF__BUFFER_STORE_BYTE + ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_byte") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_BYTE + + Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() + { + } // ~Inst_MUBUF__BUFFER_STORE_BYTE + + // --- description from .arch file --- + // Untyped buffer store byte. + void + Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandI8 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + 
calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_SHORT class methods --- + + Inst_MUBUF__BUFFER_STORE_SHORT + ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_short") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_SHORT + + Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT() + { + } // ~Inst_MUBUF__BUFFER_STORE_SHORT + + // --- description from .arch file --- + // Untyped buffer store short. 
+ void + Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandI16 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_DWORD class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORD:: + Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dword") + { + 
setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORD + + Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORD + + // --- description from .arch file --- + // Untyped buffer store dword. + void + Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + 
initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX2 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX2 + ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX2 + + Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer store 2 dwords. + void + Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + 
addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<2>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX3 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX3 + ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx3") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX3 + + Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer store 3 dwords. 
+ void + Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + data2.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] + = data2[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<3>(gpuDynInst); + } // initiateAcc + + 
void + Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX4 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX4 + ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX4 + + Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer store 4 dwords. + void + Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + 
addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] + = data2[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 3] + = data3[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_LDS_DWORD class methods --- + + Inst_MUBUF__BUFFER_STORE_LDS_DWORD + ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_lds_dword") + { + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD + + Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD() + { + } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD + + // --- description from .arch file --- + // Store one DWORD from LDS memory to system memory without utilizing + // VGPRs. 
+ void + Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_WBINVL1 class methods --- + + Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_wbinvl1") + { + setFlag(MemoryRef); + setFlag(GPUStaticInst::MemSync); + setFlag(GlobalSegment); + setFlag(MemSync); + } // Inst_MUBUF__BUFFER_WBINVL1 + + Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1() + { + } // ~Inst_MUBUF__BUFFER_WBINVL1 + + // --- description from .arch file --- + // Write back and invalidate the shader L1. + // Always returns ACK to shader. + void + Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + gpuDynInst->computeUnit()->globalMemoryPipe. + issueRequest(gpuDynInst); + } else { + fatal("Unsupported scope for flat instruction.\n"); + } + } // execute + + void + Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst) + { + // TODO: Fix it for gfx10. Once we have the new gfx10 cache model, we + // need to precisely communicate the writeback-invalidate operation to + // the new gfx10 coalescer rather than sending AcquireRelease markers. + // The SICoalescer would need to be updated appropriately as well. 
+ injectGlobalMemFence(gpuDynInst); + } // initiateAcc + void + Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_WBINVL1_VOL class methods --- + + Inst_MUBUF__BUFFER_WBINVL1_VOL + ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*iFmt) + : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") { + // This instruction is same as buffer_wbinvl1 instruction except this + // instruction only invalidate L1 shader line with MTYPE SC and GC. + // Since Hermes L1 (TCP) do not differentiate between its cache lines, + // this instruction currently behaves (and implemented ) exactly like + // buffer_wbinvl1 instruction. + setFlag(MemoryRef); + setFlag(GPUStaticInst::MemSync); + setFlag(GlobalSegment); + setFlag(MemSync); + } // Inst_MUBUF__BUFFER_WBINVL1_VOL + + Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL() + { + } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL + + // --- description from .arch file --- + // Write back and invalidate the shader L1 only for lines that are marked + // --- volatile. + // Always returns ACK to shader. + void + Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + gpuDynInst->computeUnit()->globalMemoryPipe. 
+ issueRequest(gpuDynInst); + } else { + fatal("Unsupported scope for flat instruction.\n"); + } + } // execute + void + Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst) + { + injectGlobalMemFence(gpuDynInst); + } // initiateAcc + void + Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SWAP + ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SWAP + + Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP + ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 src(gpuDynInst, extData.VDATA); + ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1); + + rsrcDesc.read(); + offset.read(); + src.read(); + cmp.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->x_data))[lane] + = src[lane]; + (reinterpret_cast(gpuDynInst->a_data))[lane] + = cmp[lane]; + } + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + if (isAtomicRet()) { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < 
NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } + } // completeAcc + // --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_ADD + ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_ADD + + Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SUB class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SUB + ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SUB + + Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMIN + ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMIN + + Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMIN + ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMIN + + Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMAX + ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMAX + + Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMAX + ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMAX + + Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_AND class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_AND + ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_and") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_AND + + Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_AND + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_OR class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_OR + ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_or") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_OR + + Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_OR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_XOR class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_XOR + ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_xor") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_XOR + + Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_INC class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_INC + ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_inc") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_INC + + Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_INC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_DEC class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_DEC + ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_dec") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_DEC + + Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA[0:1]; + // cmp = DATA[2:3]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_add_x2") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 + + Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_AND_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_AND_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_and_x2") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 + + Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_OR_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_OR_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_or_x2") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 + + Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_INC_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + + Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/smem.cc b/src/arch/amdgpu/vega/insts/smem.cc new file mode 100644 index 0000000000..a6af4f007d --- /dev/null +++ b/src/arch/amdgpu/vega/insts/smem.cc @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SMEM__S_LOAD_DWORD class methods --- + + Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORD + + Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD() + { + } // ~Inst_SMEM__S_LOAD_DWORD + + /** + * Read 1 dword from scalar data cache. If the offset is specified as an + * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are + * ignored). If the offset is specified as an immediate 20-bit constant, + * the constant is an unsigned byte offset. + */ + void + Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<1>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX2 class methods --- + + Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX2 + + Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() + { + 
} // ~Inst_SMEM__S_LOAD_DWORDX2 + + /** + * Read 2 dwords from scalar data cache. See s_load_dword for details on + * the offset input. + */ + void + Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<2>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX4 class methods --- + + Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX4 + + Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4() + { + } // ~Inst_SMEM__S_LOAD_DWORDX4 + + // --- description from .arch file --- + // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX8 class methods --- + + Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX8 + + Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8() + { + } // ~Inst_SMEM__S_LOAD_DWORDX8 + + // --- description from .arch file --- + // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<8>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX16 class methods --- + + Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX16 + + Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16() + { + } // ~Inst_SMEM__S_LOAD_DWORDX16 + + // --- description from .arch file --- + // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<16>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORD class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORD + + Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD + + // --- description from .arch file --- + // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the + // --- offset input. 
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<1>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 1 request, size 32 + ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX2 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2 + + // --- description from .arch file --- + // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<2>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + // use U64 because 2 requests, each size 32 + ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX4 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4 + + // --- description from .arch file --- + // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 4 requests, each size 32 + ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX8 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8 + + // --- description from .arch file --- + // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<8>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 8 requests, each size 32 + ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX16 class methods --- + + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_load_dwordx16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16 + + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16() + { + } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16 + + // --- description from .arch file --- + // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); + + rsrcDesc.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, rsrcDesc, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<16>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) + { + // 16 requests, each size 32 + ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_STORE_DWORD class methods --- + + Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_STORE_DWORD + + Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD() + { + } // ~Inst_SMEM__S_STORE_DWORD + + // --- description from .arch file --- + // Write 1 dword to scalar data cache. + // If the offset is specified as an SGPR, the SGPR contains an unsigned + // BYTE offset (the 2 LSBs are ignored). + // If the offset is specified as an immediate 20-bit constant, the + // constant is an unsigned BYTE offset. 
+ void + Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); + + addr.read(); + sdata.read(); + + std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), + sizeof(ScalarRegU32)); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<1>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_STORE_DWORDX2 class methods --- + + Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_STORE_DWORDX2 + + Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2() + { + } // ~Inst_SMEM__S_STORE_DWORDX2 + + // --- description from .arch file --- + // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); + + addr.read(); + sdata.read(); + + std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), + sizeof(ScalarRegU64)); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<2>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_STORE_DWORDX4 class methods --- + + Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_STORE_DWORDX4 + + Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4() + { + } // ~Inst_SMEM__S_STORE_DWORDX4 + + // --- description from .arch file --- + // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); + + addr.read(); + sdata.read(); + + std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), + sizeof(gpuDynInst->scalar_data)); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_BUFFER_STORE_DWORD class methods --- + + Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_BUFFER_STORE_DWORD + + Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() + { + } // ~Inst_SMEM__S_BUFFER_STORE_DWORD + + // --- description from .arch file --- + // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the + // --- offset input. 
+ void + Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_BUFFER_STORE_DWORDX2 class methods --- + + Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_BUFFER_STORE_DWORDX2 + + Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2() + { + } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2 + + // --- description from .arch file --- + // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. + void + Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_BUFFER_STORE_DWORDX4 class methods --- + + Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_BUFFER_STORE_DWORDX4 + + Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4() + { + } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4 + + // --- description from .arch file --- + // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on + // the offset input. 
+ void + Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_DCACHE_INV class methods --- + + Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_inv") + { + } // Inst_SMEM__S_DCACHE_INV + + Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV() + { + } // ~Inst_SMEM__S_DCACHE_INV + + // --- description from .arch file --- + // Invalidate the scalar data cache. + void + Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_DCACHE_WB class methods --- + + Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_wb") + { + } // Inst_SMEM__S_DCACHE_WB + + Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB() + { + } // ~Inst_SMEM__S_DCACHE_WB + + // --- description from .arch file --- + // Write back dirty data in the scalar data cache. + void + Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_DCACHE_INV_VOL class methods --- + + Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_inv_vol") + { + } // Inst_SMEM__S_DCACHE_INV_VOL + + Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL() + { + } // ~Inst_SMEM__S_DCACHE_INV_VOL + + // --- description from .arch file --- + // Invalidate the scalar data cache volatile lines. 
+ void + Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_DCACHE_WB_VOL class methods --- + + Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_dcache_wb_vol") + { + } // Inst_SMEM__S_DCACHE_WB_VOL + + Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL() + { + } // ~Inst_SMEM__S_DCACHE_WB_VOL + + // --- description from .arch file --- + // Write back dirty data in the scalar data cache volatile lines. + void + Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_MEMTIME class methods --- + + Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_memtime") + { + // s_memtime does not issue a memory request + setFlag(ALU); + } // Inst_SMEM__S_MEMTIME + + Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME() + { + } // ~Inst_SMEM__S_MEMTIME + + // --- description from .arch file --- + // Return current 64-bit timestamp. + void + Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); + sdst = (ScalarRegU64)gpuDynInst->computeUnit()->curCycle(); + sdst.write(); + } // execute + // --- Inst_SMEM__S_MEMREALTIME class methods --- + + Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_memrealtime") + { + } // Inst_SMEM__S_MEMREALTIME + + Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME() + { + } // ~Inst_SMEM__S_MEMREALTIME + + // --- description from .arch file --- + // Return current 64-bit RTC. 
+ void + Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_ATC_PROBE class methods --- + + Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_atc_probe") + { + } // Inst_SMEM__S_ATC_PROBE + + Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE() + { + } // ~Inst_SMEM__S_ATC_PROBE + + // --- description from .arch file --- + // Probe or prefetch an address into the SQC data cache. + void + Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_ATC_PROBE_BUFFER class methods --- + + Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_atc_probe_buffer") + { + } // Inst_SMEM__S_ATC_PROBE_BUFFER + + Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER() + { + } // ~Inst_SMEM__S_ATC_PROBE_BUFFER + + // --- description from .arch file --- + // Probe or prefetch an address into the SQC data cache. + void + Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sop1.cc b/src/arch/amdgpu/vega/insts/sop1.cc new file mode 100644 index 0000000000..fa9a103e39 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sop1.cc @@ -0,0 +1,1504 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOP1__S_MOV_B32 class methods --- + + Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_B32 + + Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32() + { + } // ~Inst_SOP1__S_MOV_B32 + + // --- description from .arch file --- + // D.u = S0.u. 
+ void + Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOV_B64 class methods --- + + Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_B64 + + Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64() + { + } // ~Inst_SOP1__S_MOV_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64. + void + Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_CMOV_B32 class methods --- + + Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_cmov_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_CMOV_B32 + + Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32() + { + } // ~Inst_SOP1__S_CMOV_B32 + + // --- description from .arch file --- + // (SCC) then D.u = S0.u; + // else NOP. + // Conditional move. + void + Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + scc.read(); + + if (scc.rawData()) { + sdst = src.rawData(); + sdst.write(); + } + } // execute + // --- Inst_SOP1__S_CMOV_B64 class methods --- + + Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_cmov_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_CMOV_B64 + + Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64() + { + } // ~Inst_SOP1__S_CMOV_B64 + + // --- description from .arch file --- + // if (SCC) then D.u64 = S0.u64; + // else NOP. + // Conditional move. 
+ void + Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + scc.read(); + + if (scc.rawData()) { + sdst = src.rawData(); + sdst.write(); + } + } // execute + // --- Inst_SOP1__S_NOT_B32 class methods --- + + Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_not_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_NOT_B32 + + Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32() + { + } // ~Inst_SOP1__S_NOT_B32 + + // --- description from .arch file --- + // D.u = ~S0.u; + // SCC = 1 if result is non-zero. + // Bitwise negation. + void + Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = ~src.rawData(); + + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_NOT_B64 class methods --- + + Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_not_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_NOT_B64 + + Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64() + { + } // ~Inst_SOP1__S_NOT_B64 + + // --- description from .arch file --- + // D.u64 = ~S0.u64; + // SCC = 1 if result is non-zero. + // Bitwise negation. + void + Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = ~src.rawData(); + scc = sdst.rawData() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_WQM_B32 class methods --- + + Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_wqm_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_WQM_B32 + + Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32() + { + } // ~Inst_SOP1__S_WQM_B32 + + // --- description from .arch file --- + // D[i] = (S0[(i & ~3):(i | 3)] != 0); + // Computes whole quad mode for an active/valid mask. + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wholeQuadMode(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_WQM_B64 class methods --- + + Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_wqm_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_WQM_B64 + + Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64() + { + } // ~Inst_SOP1__S_WQM_B64 + + // --- description from .arch file --- + // D[i] = (S0[(i & ~3):(i | 3)] != 0); + // Computes whole quad mode for an active/valid mask. + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wholeQuadMode(src.rawData()); + scc = sdst.rawData() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BREV_B32 class methods --- + + Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_brev_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BREV_B32 + + Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32() + { + } // ~Inst_SOP1__S_BREV_B32 + + // --- description from .arch file --- + // D.u[31:0] = S0.u[0:31] (reverse bits). + void + Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = reverseBits(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BREV_B64 class methods --- + + Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_brev_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BREV_B64 + + Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64() + { + } // ~Inst_SOP1__S_BREV_B64 + + // --- description from .arch file --- + // D.u64[63:0] = S0.u64[0:63] (reverse bits). + void + Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = reverseBits(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BCNT0_I32_B32 class methods --- + + Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt0_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT0_I32_B32 + + Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32() + { + } // ~Inst_SOP1__S_BCNT0_I32_B32 + + // --- description from .arch file --- + // D.i = CountZeroBits(S0.u); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = countZeroBits(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BCNT0_I32_B64 class methods --- + + Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt0_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT0_I32_B64 + + Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64() + { + } // ~Inst_SOP1__S_BCNT0_I32_B64 + + // --- description from .arch file --- + // D.i = CountZeroBits(S0.u64); + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = countZeroBits(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BCNT1_I32_B32 class methods --- + + Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt1_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT1_I32_B32 + + Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32() + { + } // ~Inst_SOP1__S_BCNT1_I32_B32 + + // --- description from .arch file --- + // D.i = CountOneBits(S0.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = popCount(src.rawData()); + scc = sdst.rawData() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_BCNT1_I32_B64 class methods --- + + Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bcnt1_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BCNT1_I32_B64 + + Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64() + { + } // ~Inst_SOP1__S_BCNT1_I32_B64 + + // --- description from .arch file --- + // D.i = CountOneBits(S0.u64); + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = popCount(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_FF0_I32_B32 class methods --- + + Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff0_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FF0_I32_B32 + + Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32() + { + } // ~Inst_SOP1__S_FF0_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstZero(S0.u); + // If no zeros are found, return -1. + // Returns the bit position of the first zero from the LSB. 
+ void + Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstZero(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF0_I32_B64 class methods --- + + Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff0_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FF0_I32_B64 + + Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() + { + } // ~Inst_SOP1__S_FF0_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstZero(S0.u64); + // If no zeros are found, return -1. + // Returns the bit position of the first zero from the LSB. + void + Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstZero(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF1_I32_B32 class methods --- + + Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff1_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FF1_I32_B32 + + Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() + { + } // ~Inst_SOP1__S_FF1_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u); + // If no ones are found, return -1. + // Returns the bit position of the first one from the LSB. 
+ void + Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstOne(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF1_I32_B64 class methods --- + + Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff1_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FF1_I32_B64 + + Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() + { + } // ~Inst_SOP1__S_FF1_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u64); + // If no ones are found, return -1. + // Returns the bit position of the first one from the LSB. + void + Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstOne(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_B32 class methods --- + + Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_B32 + + Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() + { + } // ~Inst_SOP1__S_FLBIT_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u); + // If no ones are found, return -1. + // Counts how many zeros before the first one starting from the MSB. 
+ void + Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = countZeroBitsMsb(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_B64 class methods --- + + Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_B64 + + Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() + { + } // ~Inst_SOP1__S_FLBIT_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u64); + // If no ones are found, return -1. + // Counts how many zeros before the first one starting from the MSB. + void + Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = countZeroBitsMsb(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32 class methods --- + + Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32 + + Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32() + { + } // ~Inst_SOP1__S_FLBIT_I32 + + // --- description from .arch file --- + // D.i = FirstOppositeSignBit(S0.i); + // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. + // Counts how many bits in a row (from MSB to LSB) are the same as the + // sign bit. 
+ void + Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = firstOppositeSignBit(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_I64 class methods --- + + Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_i64") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_I64 + + Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() + { + } // ~Inst_SOP1__S_FLBIT_I32_I64 + + // --- description from .arch file --- + // D.i = FirstOppositeSignBit(S0.i64); + // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. + // Counts how many bits in a row (from MSB to LSB) are the same as the + // sign bit. + void + Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = firstOppositeSignBit(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_SEXT_I32_I8 class methods --- + + Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_sext_i32_i8") + { + setFlag(ALU); + } // Inst_SOP1__S_SEXT_I32_I8 + + Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() + { + } // ~Inst_SOP1__S_SEXT_I32_I8 + + // --- description from .arch file --- + // D.i = signext(S0.i[7:0]) (sign extension). 
+ void + Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = sext::digits>( + bits(src.rawData(), 7, 0)); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_SEXT_I32_I16 class methods --- + + Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_sext_i32_i16") + { + setFlag(ALU); + } // Inst_SOP1__S_SEXT_I32_I16 + + Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16() + { + } // ~Inst_SOP1__S_SEXT_I32_I16 + + // --- description from .arch file --- + // D.i = signext(S0.i[15:0]) (sign extension). + void + Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = sext::digits>( + bits(src.rawData(), 15, 0)); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET0_B32 class methods --- + + Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset0_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET0_B32 + + Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32() + { + } // ~Inst_SOP1__S_BITSET0_B32 + + // --- description from .arch file --- + // D.u[S0.u[4:0]] = 0. 
+ void + Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 4, 0), 0); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET0_B64 class methods --- + + Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset0_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET0_B64 + + Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64() + { + } // ~Inst_SOP1__S_BITSET0_B64 + + // --- description from .arch file --- + // D.u64[S0.u[5:0]] = 0. + void + Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 5, 0), 0); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET1_B32 class methods --- + + Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset1_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET1_B32 + + Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32() + { + } // ~Inst_SOP1__S_BITSET1_B32 + + // --- description from .arch file --- + // D.u[S0.u[4:0]] = 1. + void + Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 4, 0), 1); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET1_B64 class methods --- + + Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset1_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET1_B64 + + Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64() + { + } // ~Inst_SOP1__S_BITSET1_B64 + + // --- description from .arch file --- + // D.u64[S0.u[5:0]] = 1. 
+ void + Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 5, 0), 1); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_GETPC_B64 class methods --- + + Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_getpc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_GETPC_B64 + + Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64() + { + } // ~Inst_SOP1__S_GETPC_B64 + + // --- description from .arch file --- + // D.u64 = PC + 4. + // Destination receives the byte address of the next instruction. + void + Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Addr pc = gpuDynInst->pc(); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + sdst = pc + 4; + + sdst.write(); + } // execute + // --- Inst_SOP1__S_SETPC_B64 class methods --- + + Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_setpc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_SETPC_B64 + + Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() + { + } // ~Inst_SOP1__S_SETPC_B64 + + // --- description from .arch file --- + // PC = S0.u64. + // S0.u64 is a byte address of the instruction to jump to. + void + Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + + src.read(); + + wf->pc(src.rawData()); + } // execute + // --- Inst_SOP1__S_SWAPPC_B64 class methods --- + + Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_swappc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_SWAPPC_B64 + + Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() + { + } // ~Inst_SOP1__S_SWAPPC_B64 + + // --- description from .arch file --- + // D.u64 = PC + 4; PC = S0.u64. + // S0.u64 is a byte address of the instruction to jump to. 
+ void + Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = pc + 4; + + wf->pc(src.rawData()); + sdst.write(); + } // execute + // --- Inst_SOP1__S_RFE_B64 class methods --- + + Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_rfe_b64") + { + } // Inst_SOP1__S_RFE_B64 + + Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64() + { + } // ~Inst_SOP1__S_RFE_B64 + + // --- description from .arch file --- + // PRIV = 0; + // PC = S0.u64. + // Return from exception handler and continue. + // This instruction may only be used within a trap handler. + void + Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_AND_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_and_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_AND_SAVEEXEC_B64 + + Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_AND_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 & EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() & wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_OR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_or_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_OR_SAVEEXEC_B64 + + Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_OR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 | EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() | wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_XOR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_xor_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_XOR_SAVEEXEC_B64 + + Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_XOR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 ^ EXEC; + // SCC = 1 if the new value of EXEC is non-zero. 
+ void + Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() ^ wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_ANDN2_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_andn2_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64 + + Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 & ~EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() &~ wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_ORN2_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_orn2_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_ORN2_SAVEEXEC_B64 + + Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = S0.u64 | ~EXEC; + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = src.rawData() |~ wf->execMask().to_ullong(); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_NAND_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_nand_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_NAND_SAVEEXEC_B64 + + Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_NAND_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = ~(S0.u64 & EXEC); + // SCC = 1 if the new value of EXEC is non-zero. 
+ void + Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong()); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_NOR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_nor_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_NOR_SAVEEXEC_B64 + + Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_NOR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = ~(S0.u64 | EXEC); + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong()); + scc = wf->execMask().any() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_XNOR_SAVEEXEC_B64 class methods --- + + Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_xnor_saveexec_b64") + { + setFlag(ALU); + setFlag(ReadsEXEC); + setFlag(WritesEXEC); + } // Inst_SOP1__S_XNOR_SAVEEXEC_B64 + + Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64() + { + } // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64 + + // --- description from .arch file --- + // D.u64 = EXEC; + // EXEC = ~(S0.u64 ^ EXEC); + // SCC = 1 if the new value of EXEC is non-zero. + void + Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = wf->execMask().to_ullong(); + wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong()); + scc = wf->execMask().any() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_QUADMASK_B32 class methods --- + + Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_quadmask_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_QUADMASK_B32 + + Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32() + { + } // ~Inst_SOP1__S_QUADMASK_B32 + + // --- description from .arch file --- + // D.u = QuadMask(S0.u): + // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0; + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = quadMask(src.rawData()); + scc = sdst.rawData() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_QUADMASK_B64 class methods --- + + Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_quadmask_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_QUADMASK_B64 + + Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64() + { + } // ~Inst_SOP1__S_QUADMASK_B64 + + // --- description from .arch file --- + // D.u64 = QuadMask(S0.u64): + // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0; + // SCC = 1 if result is non-zero. + void + Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = quadMask(src.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_MOVRELS_B32 class methods --- + + Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movrels_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELS_B32 + + Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32() + { + } // ~Inst_SOP1__S_MOVRELS_B32 + + // --- description from .arch file --- + // D.u = SGPR[S0.u + M0.u].u (move from relative source). 
+ void + Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELS_B64 class methods --- + + Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movrels_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELS_B64 + + Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() + { + } // ~Inst_SOP1__S_MOVRELS_B64 + + // --- description from .arch file --- + // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). + // The index in M0.u must be even for this operation. + void + Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELD_B32 class methods --- + + Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movreld_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELD_B32 + + Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() + { + } // ~Inst_SOP1__S_MOVRELD_B32 + + // --- description from .arch file --- + // SGPR[D.u + M0.u].u = S0.u (move to relative destination). 
+ void + Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData()); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELD_B64 class methods --- + + Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movreld_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELD_B64 + + Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() + { + } // ~Inst_SOP1__S_MOVRELD_B64 + + // --- description from .arch file --- + // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination). + // The index in M0.u must be even for this operation. + void + Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData()); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_CBRANCH_JOIN class methods --- + + Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_cbranch_join") + { + setFlag(Branch); + setFlag(WritesEXEC); + } // Inst_SOP1__S_CBRANCH_JOIN + + Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() + { + } // ~Inst_SOP1__S_CBRANCH_JOIN + + // --- description from .arch file --- + // saved_csp = S0.u; + // if (CSP == saved_csp) then + // PC += 4; // Second time to JOIN: continue with program. + // else + // CSP -= 1; // First time to JOIN; jump to other FORK path. + // {PC, EXEC} = SGPR[CSP * 4]; // Read 128 bits from 4 consecutive + // SGPRs. + // end + // Conditional branch join point (end of conditional branch block). S0 is + // saved CSP value. + // See S_CBRANCH_G_FORK and S_CBRANCH_I_FORK for related instructions. 
+ void + Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_ABS_I32 class methods --- + + Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_abs_i32") + { + setFlag(ALU); + } // Inst_SOP1__S_ABS_I32 + + Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() + { + } // ~Inst_SOP1__S_ABS_I32 + + // --- description from .arch file --- + // if (S.i < 0) then D.i = -S.i; + // else D.i = S.i; + // SCC = 1 if result is non-zero. + // Integer absolute value. + void + Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = std::abs(src.rawData()); + + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_MOV_FED_B32 class methods --- + + Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_fed_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_FED_B32 + + Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() + { + } // ~Inst_SOP1__S_MOV_FED_B32 + + // --- description from .arch file --- + // D.u = S0.u. Introduce an EDC double-detect error on write to the + // destination SGPR. + void + Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_SET_GPR_IDX_IDX class methods --- + + Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") + { + } // Inst_SOP1__S_SET_GPR_IDX_IDX + + Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() + { + } // ~Inst_SOP1__S_SET_GPR_IDX_IDX + + // --- description from .arch file --- + // M0[7:0] = S0.u[7:0]. + // Modify the index used in vector GPR indexing. 
+ void + Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sop2.cc b/src/arch/amdgpu/vega/insts/sop2.cc new file mode 100644 index 0000000000..93618b2124 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sop2.cc @@ -0,0 +1,1555 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOP2__S_ADD_U32 class methods --- + + Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_add_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADD_U32 + + Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() + { + } // ~Inst_SOP2__S_ADD_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + // SCC = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an unsigned + // --- overflow/carry-out for S_ADDC_U32. + void + Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() + src1.rawData(); + scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) + >= 0x100000000ULL ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUB_U32 class methods --- + + Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_sub_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUB_U32 + + Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() + { + } // ~Inst_SOP2__S_SUB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out for + // --- S_SUBB_U32. + void + Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() - src1.rawData(); + scc = (src1.rawData() > src0.rawData()) ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ADD_I32 class methods --- + + Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_add_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADD_I32 + + Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() + { + } // ~Inst_SOP2__S_ADD_I32 + + // --- description from .arch file --- + // D.i = S0.i + S1.i; + // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed + // overflow. + // This opcode is not suitable for use with S_ADDC_U32 for implementing + // 64-bit operations. + void + Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() + src1.rawData(); + scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) + && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) + ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUB_I32 class methods --- + + Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_sub_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUB_I32 + + Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() + { + } // ~Inst_SOP2__S_SUB_I32 + + // --- description from .arch file --- + // D.i = S0.i - S1.i; + // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed + // overflow. + // CAUTION: The condition code behaviour for this opcode is inconsistent + // with V_SUB_I32; see V_SUB_I32 for further details. + // This opcode is not suitable for use with S_SUBB_U32 for implementing + // 64-bit operations. 
+ void + Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() - src1.rawData(); + scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) + && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ADDC_U32 class methods --- + + Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_addc_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADDC_U32 + + Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() + { + } // ~Inst_SOP2__S_ADDC_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + SCC; + // SCC = (S0.u + S1.u + SCC >= 0x800000000ULL ? 1 : 0) is an unsigned + // overflow. + void + Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = src0.rawData() + src1.rawData() + scc.rawData(); + scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() + + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUBB_U32 class methods --- + + Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_subb_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUBB_U32 + + Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() + { + } // ~Inst_SOP2__S_SUBB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u - SCC; + // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. 
+ void + Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = src0.rawData() - src1.rawData() - scc.rawData(); + scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MIN_I32 class methods --- + + Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_min_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MIN_I32 + + Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() + { + } // ~Inst_SOP2__S_MIN_I32 + + // --- description from .arch file --- + // D.i = (S0.i < S1.i) ? S0.i : S1.i; + // SCC = 1 if S0 is chosen as the minimum value. + void + Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::min(src0.rawData(), src1.rawData()); + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MIN_U32 class methods --- + + Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_min_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MIN_U32 + + Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() + { + } // ~Inst_SOP2__S_MIN_U32 + + // --- description from .arch file --- + // D.u = (S0.u < S1.u) ? S0.u : S1.u; + // SCC = 1 if S0 is chosen as the minimum value. 
+ void + Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::min(src0.rawData(), src1.rawData()); + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MAX_I32 class methods --- + + Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_max_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MAX_I32 + + Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() + { + } // ~Inst_SOP2__S_MAX_I32 + + // --- description from .arch file --- + // D.i = (S0.i > S1.i) ? S0.i : S1.i; + // SCC = 1 if S0 is chosen as the maximum value. + void + Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::max(src0.rawData(), src1.rawData()); + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MAX_U32 class methods --- + + Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_max_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MAX_U32 + + Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() + { + } // ~Inst_SOP2__S_MAX_U32 + + // --- description from .arch file --- + // D.u = (S0.u > S1.u) ? S0.u : S1.u; + // SCC = 1 if S0 is chosen as the maximum value. 
+ void + Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::max(src0.rawData(), src1.rawData()); + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_CSELECT_B32 class methods --- + + Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_cselect_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_CSELECT_B32 + + Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() + { + } // ~Inst_SOP2__S_CSELECT_B32 + + // --- description from .arch file --- + // D.u = SCC ? S0.u : S1.u (conditional select). + void + Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = scc.rawData() ? src0.rawData() : src1.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_CSELECT_B64 class methods --- + + Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_cselect_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_CSELECT_B64 + + Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() + { + } // ~Inst_SOP2__S_CSELECT_B64 + + // --- description from .arch file --- + // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). 
+ void + Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = scc.rawData() ? src0.rawData() : src1.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_AND_B32 class methods --- + + Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_and_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_AND_B32 + + Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() + { + } // ~Inst_SOP2__S_AND_B32 + + // --- description from .arch file --- + // D.u = S0.u & S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() & src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_AND_B64 class methods --- + + Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_and_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_AND_B64 + + Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() + { + } // ~Inst_SOP2__S_AND_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 & S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() & src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_OR_B32 class methods --- + + Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_or_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_OR_B32 + + Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() + { + } // ~Inst_SOP2__S_OR_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() | src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_OR_B64 class methods --- + + Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_or_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_OR_B64 + + Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() + { + } // ~Inst_SOP2__S_OR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 | S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() | src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XOR_B32 class methods --- + + Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_XOR_B32 + + Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() + { + } // ~Inst_SOP2__S_XOR_B32 + + // --- description from .arch file --- + // D.u = S0.u ^ S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() ^ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XOR_B64 class methods --- + + Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_XOR_B64 + + Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() + { + } // ~Inst_SOP2__S_XOR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 ^ S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() ^ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ANDN2_B32 class methods --- + + Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_andn2_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_ANDN2_B32 + + Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() + { + } // ~Inst_SOP2__S_ANDN2_B32 + + // --- description from .arch file --- + // D.u = S0.u & ~S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() &~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ANDN2_B64 class methods --- + + Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_andn2_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_ANDN2_B64 + + Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() + { + } // ~Inst_SOP2__S_ANDN2_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 & ~S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() &~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ORN2_B32 class methods --- + + Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_orn2_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_ORN2_B32 + + Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() + { + } // ~Inst_SOP2__S_ORN2_B32 + + // --- description from .arch file --- + // D.u = S0.u | ~S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() |~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ORN2_B64 class methods --- + + Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_orn2_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_ORN2_B64 + + Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() + { + } // ~Inst_SOP2__S_ORN2_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 | ~S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() |~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NAND_B32 class methods --- + + Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nand_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_NAND_B32 + + Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() + { + } // ~Inst_SOP2__S_NAND_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u & S1.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() & src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NAND_B64 class methods --- + + Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nand_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_NAND_B64 + + Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() + { + } // ~Inst_SOP2__S_NAND_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 & S1.u64); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() & src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NOR_B32 class methods --- + + Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_NOR_B32 + + Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() + { + } // ~Inst_SOP2__S_NOR_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u | S1.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() | src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NOR_B64 class methods --- + + Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_NOR_B64 + + Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() + { + } // ~Inst_SOP2__S_NOR_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 | S1.u64); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() | src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XNOR_B32 class methods --- + + Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xnor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_XNOR_B32 + + Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() + { + } // ~Inst_SOP2__S_XNOR_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u ^ S1.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() ^ src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XNOR_B64 class methods --- + + Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xnor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_XNOR_B64 + + Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() + { + } // ~Inst_SOP2__S_XNOR_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 ^ S1.u64); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() ^ src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHL_B32 class methods --- + + Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshl_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHL_B32 + + Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() + { + } // ~Inst_SOP2__S_LSHL_B32 + + // --- description from .arch file --- + // D.u = S0.u << S1.u[4:0]; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() << bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHL_B64 class methods --- + + Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshl_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHL_B64 + + Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() + { + } // ~Inst_SOP2__S_LSHL_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 << S1.u[5:0]; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() << bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHR_B32 class methods --- + + Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshr_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHR_B32 + + Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() + { + } // ~Inst_SOP2__S_LSHR_B32 + + // --- description from .arch file --- + // D.u = S0.u >> S1.u[4:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to zero. + void + Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHR_B64 class methods --- + + Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshr_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHR_B64 + + Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() + { + } // ~Inst_SOP2__S_LSHR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 >> S1.u[5:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to zero. 
+ void + Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ASHR_I32 class methods --- + + Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_ashr_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_ASHR_I32 + + Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() + { + } // ~Inst_SOP2__S_ASHR_I32 + + // --- description from .arch file --- + // D.i = signext(S0.i) >> S1.u[4:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to the sign bit of the input value. + void + Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ASHR_I64 class methods --- + + Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_ashr_i64") + { + setFlag(ALU); + } // Inst_SOP2__S_ASHR_I64 + + Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() + { + } // ~Inst_SOP2__S_ASHR_I64 + + // --- description from .arch file --- + // D.i64 = signext(S0.i64) >> S1.u[5:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to the sign bit of the input value. 
+ void + Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_BFM_B32 class methods --- + + Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfm_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_BFM_B32 + + Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32() + { + } // ~Inst_SOP2__S_BFM_B32 + + // --- description from .arch file --- + // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask). + void + Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1) + << bits(src1.rawData(), 4, 0); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_BFM_B64 class methods --- + + Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfm_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_BFM_B64 + + Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64() + { + } // ~Inst_SOP2__S_BFM_B64 + + // --- description from .arch file --- + // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask). 
+ void + Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1) + << bits(src1.rawData(), 5, 0); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_MUL_I32 class methods --- + + Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_mul_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MUL_I32 + + Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32() + { + } // ~Inst_SOP2__S_MUL_I32 + + // --- description from .arch file --- + // D.i = S0.i * S1.i. + void + Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + sdst = src0.rawData() * src1.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_BFE_U32 class methods --- + + Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfe_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_BFE_U32 + + Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32() + { + } // ~Inst_SOP2__S_BFE_U32 + + // --- description from .arch file --- + // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is + // field width. + // D.u = (S0.u>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) + & ((1 << bits(src1.rawData(), 22, 16)) - 1); + scc = sdst.rawData() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_BFE_I32 class methods --- + + Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfe_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_BFE_I32 + + Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32() + { + } // ~Inst_SOP2__S_BFE_I32 + + // --- description from .arch file --- + // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is + // field width. + // D.i = (S0.i>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) + & ((1 << bits(src1.rawData(), 22, 16)) - 1); + + // Above extracted a signed int of size src1[22:16] bits which needs + // to be signed-extended. Check if the MSB of our src1[22:16]-bit + // integer is 1, and sign extend it is. + // + // Note: The description in the Vega ISA manual does not mention to + // sign-extend the result. An update description can be found in the + // more recent RDNA3 manual here: + // https://developer.amd.com/wp-content/resources/ + // RDNA3_Shader_ISA_December2022.pdf + if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { + sdst = sdst.rawData() + | (0xffffffff << bits(src1.rawData(), 22, 16)); + } + + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_BFE_U64 class methods --- + + Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfe_u64") + { + setFlag(ALU); + } // Inst_SOP2__S_BFE_U64 + + Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64() + { + } // ~Inst_SOP2__S_BFE_U64 + + // --- description from .arch file --- + // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is + // field width. + // D.u64 = (S0.u64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) + & ((1 << bits(src1.rawData(), 22, 16)) - 1); + scc = sdst.rawData() ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_BFE_I64 class methods --- + + Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfe_i64") + { + setFlag(ALU); + } // Inst_SOP2__S_BFE_I64 + + Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64() + { + } // ~Inst_SOP2__S_BFE_I64 + + // --- description from .arch file --- + // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is + // field width. + // D.i64 = (S0.i64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) + & ((1 << bits(src1.rawData(), 22, 16)) - 1); + + // Above extracted a signed int of size src1[22:16] bits which needs + // to be signed-extended. Check if the MSB of our src1[22:16]-bit + // integer is 1, and sign extend it is. + if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { + sdst = sdst.rawData() + | 0xffffffffffffffff << bits(src1.rawData(), 22, 16); + } + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_CBRANCH_G_FORK class methods --- + + Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_cbranch_g_fork") + { + setFlag(Branch); + } // Inst_SOP2__S_CBRANCH_G_FORK + + Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK() + { + } // ~Inst_SOP2__S_CBRANCH_G_FORK + + // --- description from .arch file --- + // mask_pass = S0.u64 & EXEC; + // mask_fail = ~S0.u64 & EXEC; + // if (mask_pass == EXEC) + // PC = S1.u64; + // elsif (mask_fail == EXEC) + // PC += 4; + // elsif (bitcount(mask_fail) < bitcount(mask_pass)) + // EXEC = mask_fail; + // SGPR[CSP*4] = { S1.u64, mask_pass }; + // CSP++; + // PC += 4; + // else + // EXEC = mask_pass; + // SGPR[CSP*4] = { PC + 4, mask_fail }; + // CSP++; + // PC = S1.u64; + // end. + // Conditional branch using branch-stack. + // S0 = compare mask(vcc or any sgpr) and + // S1 = 64-bit byte address of target instruction. + // See also S_CBRANCH_JOIN. 
+ void + Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP2__S_ABSDIFF_I32 class methods --- + + Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_absdiff_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_ABSDIFF_I32 + + Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32() + { + } // ~Inst_SOP2__S_ABSDIFF_I32 + + // --- description from .arch file --- + // D.i = S0.i - S1.i; + // if (D.i < 0) then D.i = -D.i; + // SCC = 1 if result is non-zero. + // Compute the absolute value of difference between two values. + void + Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + sdst = std::abs(src0.rawData() - src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_RFE_RESTORE_B64 class methods --- + + Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64( + InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_rfe_restore_b64") + { + } // Inst_SOP2__S_RFE_RESTORE_B64 + + Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64() + { + } // ~Inst_SOP2__S_RFE_RESTORE_B64 + + // --- description from .arch file --- + // PRIV = 0; + // PC = S0.u64; + // INST_ATC = S1.u32[0]. + // Return from exception handler and continue, possibly changing the + // --- instruction ATC mode. + // This instruction may only be used within a trap handler. + // Use this instruction when the main program may be in a different memory + // --- space than the trap handler. 
+ void + Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP2__S_MUL_HI_U32 class methods --- + + Inst_SOP2__S_MUL_HI_U32::Inst_SOP2__S_MUL_HI_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_mul_hi_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MUL_HI_U32 + + Inst_SOP2__S_MUL_HI_U32::~Inst_SOP2__S_MUL_HI_U32() + { + } // ~Inst_SOP2__S_MUL_HI_U32 + + // --- description from .arch file --- + // D.u = (S0.u * S1.u) >> 32; + void + Inst_SOP2__S_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + VecElemU64 tmp_dst = + ((VecElemU64)src0.rawData() * (VecElemU64)src1.rawData()); + sdst = (tmp_dst >> 32); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_MUL_HI_I32 class methods --- + + Inst_SOP2__S_MUL_HI_I32::Inst_SOP2__S_MUL_HI_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_mul_hi_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MUL_HI_I32 + + Inst_SOP2__S_MUL_HI_I32::~Inst_SOP2__S_MUL_HI_I32() + { + } // ~Inst_SOP2__S_MUL_HI_I32 + + // --- description from .arch file --- + // D.u = (S0.u * S1.u) >> 32; + void + Inst_SOP2__S_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + VecElemI64 tmp_src0 = + sext::digits>(src0.rawData()); + VecElemI64 tmp_src1 = + sext::digits>(src1.rawData()); + sdst = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + + sdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopc.cc b/src/arch/amdgpu/vega/insts/sopc.cc new file mode 100644 index 0000000000..9c58688e53 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopc.cc @@ 
-0,0 +1,599 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPC__S_CMP_EQ_I32 class methods --- + + Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_I32 + + Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() + { + } // ~Inst_SOPC__S_CMP_EQ_I32 + + // --- description from .arch file --- + // SCC = (S0.i == S1.i). + void + Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_I32 class methods --- + + Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_I32 + + Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() + { + } // ~Inst_SOPC__S_CMP_LG_I32 + + // --- description from .arch file --- + // SCC = (S0.i != S1.i). + void + Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GT_I32 class methods --- + + Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_gt_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GT_I32 + + Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() + { + } // ~Inst_SOPC__S_CMP_GT_I32 + + // --- description from .arch file --- + // SCC = (S0.i > S1.i). 
+ void + Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GE_I32 class methods --- + + Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_ge_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GE_I32 + + Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() + { + } // ~Inst_SOPC__S_CMP_GE_I32 + + // --- description from .arch file --- + // SCC = (S0.i >= S1.i). + void + Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LT_I32 class methods --- + + Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lt_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LT_I32 + + Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() + { + } // ~Inst_SOPC__S_CMP_LT_I32 + + // --- description from .arch file --- + // SCC = (S0.i < S1.i). + void + Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LE_I32 class methods --- + + Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_le_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LE_I32 + + Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() + { + } // ~Inst_SOPC__S_CMP_LE_I32 + + // --- description from .arch file --- + // SCC = (S0.i <= S1.i). + void + Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_EQ_U32 class methods --- + + Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_U32 + + Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() + { + } // ~Inst_SOPC__S_CMP_EQ_U32 + + // --- description from .arch file --- + // SCC = (S0.u == S1.u). + void + Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_U32 class methods --- + + Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_U32 + + Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() + { + } // ~Inst_SOPC__S_CMP_LG_U32 + + // --- description from .arch file --- + // SCC = (S0.u != S1.u). 
+ void + Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GT_U32 class methods --- + + Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_gt_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GT_U32 + + Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() + { + } // ~Inst_SOPC__S_CMP_GT_U32 + + // --- description from .arch file --- + // SCC = (S0.u > S1.u). + void + Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GE_U32 class methods --- + + Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_ge_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GE_U32 + + Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() + { + } // ~Inst_SOPC__S_CMP_GE_U32 + + // --- description from .arch file --- + // SCC = (S0.u >= S1.u). + void + Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() >= src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LT_U32 class methods --- + + Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lt_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LT_U32 + + Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() + { + } // ~Inst_SOPC__S_CMP_LT_U32 + + // --- description from .arch file --- + // SCC = (S0.u < S1.u). + void + Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LE_U32 class methods --- + + Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_le_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LE_U32 + + Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() + { + } // ~Inst_SOPC__S_CMP_LE_U32 + + // --- description from .arch file --- + // SCC = (S0.u <= S1.u). + void + Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP0_B32 class methods --- + + Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp0_b32") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP0_B32 + + Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() + { + } // ~Inst_SOPC__S_BITCMP0_B32 + + // --- description from .arch file --- + // SCC = (S0.u[S1.u[4:0]] == 0). 
+ void + Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP1_B32 class methods --- + + Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp1_b32") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP1_B32 + + Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32() + { + } // ~Inst_SOPC__S_BITCMP1_B32 + + // --- description from .arch file --- + // SCC = (S0.u[S1.u[4:0]] == 1). + void + Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP0_B64 class methods --- + + Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp0_b64") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP0_B64 + + Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64() + { + } // ~Inst_SOPC__S_BITCMP0_B64 + + // --- description from .arch file --- + // SCC = (S0.u64[S1.u[5:0]] == 0). + void + Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP1_B64 class methods --- + + Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp1_b64") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP1_B64 + + Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64() + { + } // ~Inst_SOPC__S_BITCMP1_B64 + + // --- description from .arch file --- + // SCC = (S0.u64[S1.u[5:0]] == 1). + void + Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_SETVSKIP class methods --- + + Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_setvskip") + { + } // Inst_SOPC__S_SETVSKIP + + Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP() + { + } // ~Inst_SOPC__S_SETVSKIP + + // --- description from .arch file --- + // VSKIP = S0.u[S1.u[4:0]]. + // Enables and disables VSKIP mode. + // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instuctions are + // issued. + // If any vector operations are outstanding, S_WAITCNT must be issued + // before executing. + // This instruction requires one waitstate after executing (e.g. S_NOP 0). + // Example: + // s_waitcnt 0 + // s_setvskip 1, 0 // Enable vskip mode. 
+ // s_nop 1 + void + Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods --- + + Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_set_gpr_idx_on") + { + } // Inst_SOPC__S_SET_GPR_IDX_ON + + Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON() + { + } // ~Inst_SOPC__S_SET_GPR_IDX_ON + + // --- description from .arch file --- + // MODE.gpr_idx_en = 1; + // M0[7:0] = S0.u[7:0]; + // M0[15:12] = SIMM4 (direct contents of S1 field); + // // Remaining bits of M0 are unmodified. + // Enable GPR indexing mode. Vector operations after this will perform + // relative GPR addressing based on the contents of M0. The structure + // SQ_M0_GPR_IDX_WORD may be used to decode M0. + // The raw contents of the S1 field are read and used to set the enable + // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and + // S1[3] = VDST_REL. + void + Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPC__S_CMP_EQ_U64 class methods --- + + Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_u64") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_U64 + + Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64() + { + } // ~Inst_SOPC__S_CMP_EQ_U64 + + // --- description from .arch file --- + // SCC = (S0.i64 == S1.i64). + void + Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_U64 class methods --- + + Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_u64") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_U64 + + Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64() + { + } // ~Inst_SOPC__S_CMP_LG_U64 + + // --- description from .arch file --- + // SCC = (S0.i64 != S1.i64). + void + Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopk.cc b/src/arch/amdgpu/vega/insts/sopk.cc new file mode 100644 index 0000000000..7abbb9abb4 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopk.cc @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "dev/amdgpu/hwreg_defines.hh" +#include "gpu-compute/shader.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPK__S_MOVK_I32 class methods --- + + Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_movk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_MOVK_I32 + + Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() + { + } // ~Inst_SOPK__S_MOVK_I32 + + // --- description from .arch file --- + // D.i = signext(SIMM16) (sign extension). + void + Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + sdst = simm16; + + sdst.write(); + } // execute + // --- Inst_SOPK__S_CMOVK_I32 class methods --- + + Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmovk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMOVK_I32 + + Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() + { + } // ~Inst_SOPK__S_CMOVK_I32 + + // --- description from .arch file --- + // if (SCC) then D.i = signext(SIMM16); + // else NOP. 
+ // Conditional move with sign extension. + void + Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + scc.read(); + + if (scc.rawData()) { + sdst = simm16; + sdst.write(); + } + } // execute + // --- Inst_SOPK__S_CMPK_EQ_I32 class methods --- + + Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_eq_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_EQ_I32 + + Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() + { + } // ~Inst_SOPK__S_CMPK_EQ_I32 + + // --- description from .arch file --- + // SCC = (S0.i == signext(SIMM16)). + void + Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() == simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LG_I32 class methods --- + + Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lg_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LG_I32 + + Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() + { + } // ~Inst_SOPK__S_CMPK_LG_I32 + + // --- description from .arch file --- + // SCC = (S0.i != signext(SIMM16)). + void + Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() != simm16) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GT_I32 class methods --- + + Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_gt_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GT_I32 + + Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() + { + } // ~Inst_SOPK__S_CMPK_GT_I32 + + // --- description from .arch file --- + // SCC = (S0.i > signext(SIMM16)). + void + Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() > simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GE_I32 class methods --- + + Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_ge_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GE_I32 + + Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() + { + } // ~Inst_SOPK__S_CMPK_GE_I32 + + // --- description from .arch file --- + // SCC = (S0.i >= signext(SIMM16)). + void + Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() >= simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LT_I32 class methods --- + + Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lt_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LT_I32 + + Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() + { + } // ~Inst_SOPK__S_CMPK_LT_I32 + + // --- description from .arch file --- + // SCC = (S0.i < signext(SIMM16)). 
+ void + Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() < simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LE_I32 class methods --- + + Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_le_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LE_I32 + + Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32() + { + } // ~Inst_SOPK__S_CMPK_LE_I32 + + // --- description from .arch file --- + // SCC = (S0.i <= signext(SIMM16)). + void + Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() <= simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_EQ_U32 class methods --- + + Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_eq_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_EQ_U32 + + Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32() + { + } // ~Inst_SOPK__S_CMPK_EQ_U32 + + // --- description from .arch file --- + // SCC = (S0.u == SIMM16). + void + Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() == simm16) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LG_U32 class methods --- + + Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lg_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LG_U32 + + Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32() + { + } // ~Inst_SOPK__S_CMPK_LG_U32 + + // --- description from .arch file --- + // SCC = (S0.u != SIMM16). + void + Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() != simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GT_U32 class methods --- + + Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_gt_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GT_U32 + + Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32() + { + } // ~Inst_SOPK__S_CMPK_GT_U32 + + // --- description from .arch file --- + // SCC = (S0.u > SIMM16). + void + Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() > simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GE_U32 class methods --- + + Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_ge_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GE_U32 + + Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32() + { + } // ~Inst_SOPK__S_CMPK_GE_U32 + + // --- description from .arch file --- + // SCC = (S0.u >= SIMM16). 
+ void + Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() >= simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LT_U32 class methods --- + + Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lt_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LT_U32 + + Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32() + { + } // ~Inst_SOPK__S_CMPK_LT_U32 + + // --- description from .arch file --- + // SCC = (S0.u < SIMM16). + void + Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() < simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LE_U32 class methods --- + + Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_le_u32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LE_U32 + + Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32() + { + } // ~Inst_SOPK__S_CMPK_LE_U32 + + // --- description from .arch file --- + // SCC = (S0.u <= SIMM16). + void + Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; + ConstScalarOperandU32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() <= simm16) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_ADDK_I32 class methods --- + + Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_addk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_ADDK_I32 + + Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32() + { + } // ~Inst_SOPK__S_ADDK_I32 + + // --- description from .arch file --- + // D.i = D.i + signext(SIMM16); + // SCC = overflow. + void + Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16); + scc = (bits(src.rawData(), 31) == bits(simm16, 15) + && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOPK__S_MULK_I32 class methods --- + + Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_mulk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_MULK_I32 + + Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32() + { + } // ~Inst_SOPK__S_MULK_I32 + + // --- description from .arch file --- + // D.i = D.i * signext(SIMM16). 
+ void + Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16); + + sdst.write(); + } // execute + // --- Inst_SOPK__S_CBRANCH_I_FORK class methods --- + + Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cbranch_i_fork") + { + setFlag(Branch); + } // Inst_SOPK__S_CBRANCH_I_FORK + + Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK() + { + } // ~Inst_SOPK__S_CBRANCH_I_FORK + + // --- description from .arch file --- + // mask_pass = S0.u64 & EXEC; + // mask_fail = ~S0.u64 & EXEC; + // target_addr = PC + signext(SIMM16 * 4) + 4; + // if (mask_pass == EXEC) + // PC = target_addr; + // elsif (mask_fail == EXEC) + // PC += 4; + // elsif (bitcount(mask_fail) < bitcount(mask_pass)) + // EXEC = mask_fail; + // SGPR[CSP*4] = { target_addr, mask_pass }; + // CSP++; + // PC += 4; + // else + // EXEC = mask_pass; + // SGPR[CSP*4] = { PC + 4, mask_fail }; + // CSP++; + // PC = target_addr; + // end. + // Conditional branch using branch-stack. + // S0 = compare mask(vcc or any sgpr), and + // SIMM16 = signed DWORD branch offset relative to next instruction. + // See also S_CBRANCH_JOIN. + void + Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPK__S_GETREG_B32 class methods --- + + Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_getreg_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_GETREG_B32 + + Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32() + { + } // ~Inst_SOPK__S_GETREG_B32 + + // --- description from .arch file --- + // D.u = hardware-reg. Read some or all of a hardware register into the + // LSBs of D. 
+ // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + sdst.read(); + + // Store value from hardware to part of the SDST. + ScalarRegU32 mask = (((1U << size) - 1U) << offset); + sdst = (hwreg & mask) >> offset; + sdst.write(); + } // execute + // --- Inst_SOPK__S_SETREG_B32 class methods --- + + Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_setreg_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_SETREG_B32 + + Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() + { + } // ~Inst_SOPK__S_SETREG_B32 + + // --- description from .arch file --- + // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware + // register. + // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + sdst.read(); + + // Store value from SDST to part of the hardware register. 
+ ScalarRegU32 mask = (((1U << size) - 1U) << offset); + hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask)); + gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); + + // set MODE register to control the behavior of single precision + // floating-point numbers: denormal mode or round mode + if (hwregId==1 && size==2 + && (offset==4 || offset==0)) { + warn_once("Be cautious that s_setreg_b32 has no real effect " + "on FP modes: %s\n", gpuDynInst->disassemble()); + return; + } + + // panic if not changing MODE of floating-point numbers + panicUnimplemented(); + } // execute + // --- Inst_SOPK__S_SETREG_IMM32_B32 class methods --- + + Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32( + InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_setreg_imm32_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_SETREG_IMM32_B32 + + Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32() + { + } // ~Inst_SOPK__S_SETREG_IMM32_B32 + + // --- description from .arch file --- + // Write some or all of the LSBs of IMM32 into a hardware register; this + // --- instruction requires a 32-bit literal constant. + // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarRegI32 simm32 = extData.imm_u32; + + // Store value from SIMM32 to part of the hardware register. 
+ ScalarRegU32 mask = (((1U << size) - 1U) << offset); + hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask)); + gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); + + // set MODE register to control the behavior of single precision + // floating-point numbers: denormal mode or round mode + if (hwregId==HW_REG_MODE && size==2 + && (offset==4 || offset==0)) { + warn_once("Be cautious that s_setreg_imm32_b32 has no real effect " + "on FP modes: %s\n", gpuDynInst->disassemble()); + return; + } + + // panic if not changing modes of single-precision FPs + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopp.cc b/src/arch/amdgpu/vega/insts/sopp.cc new file mode 100644 index 0000000000..df5cdbf681 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopp.cc @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "debug/GPUSync.hh" +#include "gpu-compute/shader.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPP__S_NOP class methods --- + + Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_nop") + { + setFlag(Nop); + } // Inst_SOPP__S_NOP + + Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP() + { + } // ~Inst_SOPP__S_NOP + + // --- description from .arch file --- + // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time, + // 7 = 8 times. + // This instruction may be used to introduce wait states to resolve + // hazards; see the shader programming guide for details. Compare with + // S_SLEEP. + void + Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_SOPP__S_ENDPGM class methods --- + + Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_endpgm") + { + setFlag(EndOfKernel); + } // Inst_SOPP__S_ENDPGM + + Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM() + { + } // ~Inst_SOPP__S_ENDPGM + + // --- description from .arch file --- + // End of program; terminate wavefront. + // The hardware implicitly executes S_WAITCNT 0 before executing this + // --- instruction. + // See S_ENDPGM_SAVED for the context-switch version of this instruction. 
    void
    Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // delete extra instructions fetched for completed work-items
        wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
            wf->instructionBuffer.end());

        if (wf->pendingFetch) {
            wf->dropFetch = true;
        }

        wf->computeUnit->fetchStage.fetchUnit(wf->simdId)
            .flushBuf(wf->wfSlotId);
        wf->setStatus(Wavefront::S_STOPPED);

        // refCount tracks how many WFs of this workgroup are still resident;
        // this WF's retirement decrements it.
        int refCount = wf->computeUnit->getLds()
            .decreaseRefCounter(wf->dispatchId, wf->wgId);

        /**
         * The parent WF of this instruction is exiting, therefore
         * it should not participate in this barrier any longer. This
         * prevents possible deadlock issues if WFs exit early.
         */
        int bar_id = WFBarrier::InvalidID;
        if (wf->hasBarrier()) {
            assert(wf->getStatus() != Wavefront::S_BARRIER);
            bar_id = wf->barrierId();
            assert(bar_id != WFBarrier::InvalidID);
            wf->releaseBarrier();
            cu->decMaxBarrierCnt(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
                    "program and decrementing max barrier count for "
                    "barrier Id%d. New max count: %d.\n", cu->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
                    cu->maxBarrierCnt(bar_id));
        }

        DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
            wf->computeUnit->cu_id, wf->wgId, refCount);

        // Release this WF's registers and update CU-level accounting.
        wf->computeUnit->registerManager->freeRegisters(wf);
        wf->computeUnit->stats.completedWfs++;
        wf->computeUnit->activeWaves--;

        panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
            "than zero\n", wf->computeUnit->cu_id);

        DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
            wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId);

        // Sample read-after-write distance stats before clearing them.
        for (int i = 0; i < wf->vecReads.size(); i++) {
            if (wf->rawDist.find(i) != wf->rawDist.end()) {
                wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
            }
        }
        wf->vecReads.clear();
        wf->rawDist.clear();
        wf->lastInstExec = 0;

        if (!refCount) {
            /**
             * If all WFs have finished, and hence the WG has finished,
             * then we can free up the barrier belonging to the parent
             * WG, but only if we actually used a barrier (i.e., more
             * than one WF in the WG).
             */
            if (bar_id != WFBarrier::InvalidID) {
                DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
                        "now complete. Releasing barrier Id%d.\n", cu->cu_id,
                        wf->simdId, wf->wfSlotId, wf->wfDynId,
                        wf->barrierId());
                cu->releaseBarrier(bar_id);
            }

            /**
             * Last wavefront of the workgroup has executed return. If the
             * workgroup is not the final one in the kernel, then simply
             * retire it; however, if it is the final one, i.e., indicating
             * the kernel end, then release operation (i.e., GL2 WB) is
             * needed
             */

            // check whether the workgroup is indicating the kernel end, i.e.,
            // the last workgroup in the kernel
            bool kernelEnd =
                wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);

            bool relNeeded =
                wf->computeUnit->shader->impl_kern_end_rel;

            // if it is not a kernel end, then retire the workgroup directly
            if (!kernelEnd || !relNeeded) {
                wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
                wf->setStatus(Wavefront::S_STOPPED);
                wf->computeUnit->stats.completedWGs++;

                return;
            }

            /**
             * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and
             * retire the workgroup after receving response.
             * note that GL0V and GL1 are read only, and they just forward GL2
             * WB request. When forwarding, GL1 send the request to all GL2 in
             * the complex
             */
            setFlag(MemSync);
            setFlag(GlobalSegment);
            // Notify Memory System of Kernel Completion
            // Kernel End = isKernel + isMemSync
            wf->setStatus(Wavefront::S_RETURNING);
            gpuDynInst->simdId = wf->simdId;
            gpuDynInst->wfSlotId = wf->wfSlotId;
            gpuDynInst->wfDynId = wf->wfDynId;

            DPRINTF(GPUExec, "inject global memory fence for CU%d: "
                    "WF[%d][%d][%d]\n", wf->computeUnit->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId);

            // call shader to prepare the flush operations
            wf->computeUnit->shader->prepareFlush(gpuDynInst);

            wf->computeUnit->stats.completedWGs++;
        } else {
            // Other WFs of this WG are still running; a slot just freed up,
            // so give the dispatcher a chance to schedule more work.
            wf->computeUnit->shader->dispatcher().scheduleDispatch();
        }
    } // execute

    // --- Inst_SOPP__S_BRANCH class methods ---

    Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_branch")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_BRANCH

    Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH()
    {
    } // ~Inst_SOPP__S_BRANCH

    // --- description from .arch file ---
    //
PC = PC + signext(SIMM16 * 4) + 4 (short jump). + // For a long jump, use S_SETPC. + void + Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_WAKEUP class methods --- + + Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_wakeup") + { + } // Inst_SOPP__S_WAKEUP + + Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP() + { + } // ~Inst_SOPP__S_WAKEUP + + // --- description from .arch file --- + // Allow a wave to 'ping' all the other waves in its threadgroup to force + // them to wake up immediately from an S_SLEEP instruction. The ping is + // ignored if the waves are not sleeping. + // This allows for more efficient polling on a memory location. The waves + // which are polling can sit in a long S_SLEEP between memory reads, but + // the wave which writes the value can tell them all to wake up early now + // that the data is available. This is useful for fBarrier implementations + // (speedup). + // This method is also safe from races because if any wave misses the ping, + // everything still works fine (whoever missed it just completes their + // normal S_SLEEP). + void + Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_SCC0 class methods --- + + Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_scc0") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_SCC0 + + Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0() + { + } // ~Inst_SOPP__S_CBRANCH_SCC0 + + // --- description from .arch file --- + // if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. 
+ void + Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + scc.read(); + + if (!scc.rawData()) { + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + } + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_CBRANCH_SCC1 class methods --- + + Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_scc1") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_SCC1 + + Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1() + { + } // ~Inst_SOPP__S_CBRANCH_SCC1 + + // --- description from .arch file --- + // if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + scc.read(); + + if (scc.rawData()) { + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + } + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_CBRANCH_VCCZ class methods --- + + Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_vccz") + { + setFlag(Branch); + setFlag(ReadsVCC); + } // Inst_SOPP__S_CBRANCH_VCCZ + + Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ() + { + } // ~Inst_SOPP__S_CBRANCH_VCCZ + + // --- description from .arch file --- + // if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. 
+ void + Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + + vcc.read(); + + if (!vcc.rawData()) { + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + } + + wf->pc(pc); + } // execute + // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods --- + + Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_vccnz") + { + setFlag(Branch); + setFlag(ReadsVCC); + } // Inst_SOPP__S_CBRANCH_VCCNZ + + Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ() + { + } // ~Inst_SOPP__S_CBRANCH_VCCNZ + + // --- description from .arch file --- + // if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + vcc.read(); + + if (vcc.rawData()) { + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + wf->pc(pc); + } + } // execute + // --- Inst_SOPP__S_CBRANCH_EXECZ class methods --- + + Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_execz") + { + setFlag(Branch); + setFlag(ReadsEXEC); + } // Inst_SOPP__S_CBRANCH_EXECZ + + Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ() + { + } // ~Inst_SOPP__S_CBRANCH_EXECZ + + // --- description from .arch file --- + // if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. 
+ void + Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (wf->execMask().none()) { + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + wf->pc(pc); + } + } // execute + // --- Inst_SOPP__S_CBRANCH_EXECNZ class methods --- + + Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_execnz") + { + setFlag(Branch); + setFlag(ReadsEXEC); + } // Inst_SOPP__S_CBRANCH_EXECNZ + + Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ() + { + } // ~Inst_SOPP__S_CBRANCH_EXECNZ + + // --- description from .arch file --- + // if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (wf->execMask().any()) { + Addr pc = gpuDynInst->pc(); + ScalarRegI16 simm16 = instData.SIMM16; + pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; + wf->pc(pc); + } + } // execute + // --- Inst_SOPP__S_BARRIER class methods --- + + Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_barrier") + { + setFlag(MemBarrier); + } // Inst_SOPP__S_BARRIER + + Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER() + { + } // ~Inst_SOPP__S_BARRIER + + // --- description from .arch file --- + // Synchronize waves within a threadgroup. + // If not all waves of the threadgroup have been created yet, waits for + // entire group before proceeding. + // If some waves in the threadgroup have already terminated, this waits on + // only the surviving waves. + // Barriers are legal inside trap handlers. 
+ void + Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ComputeUnit *cu = gpuDynInst->computeUnit(); + + if (wf->hasBarrier()) { + int bar_id = wf->barrierId(); + assert(wf->getStatus() == Wavefront::S_BARRIER); + cu->incNumAtBarrier(bar_id); + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at " + "barrier Id%d. %d waves now at barrier, %d waves " + "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId, + wf->wfDynId, bar_id, cu->numAtBarrier(bar_id), + cu->numYetToReachBarrier(bar_id)); + } + } // execute + // --- Inst_SOPP__S_SETKILL class methods --- + + Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_setkill") + { + } // Inst_SOPP__S_SETKILL + + Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL() + { + } // ~Inst_SOPP__S_SETKILL + + // --- description from .arch file --- + // set KILL bit to value of SIMM16[0]. + // Used primarily for debugging kill wave host command behavior. + void + Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_WAITCNT class methods --- + + Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_waitcnt") + { + setFlag(ALU); + setFlag(Waitcnt); + } // Inst_SOPP__S_WAITCNT + + Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT() + { + } // ~Inst_SOPP__S_WAITCNT + + // --- description from .arch file --- + // Wait for the counts of outstanding lds, vector-memory and + // --- export/vmem-write-data to be at or below the specified levels. + // SIMM16[3:0] = vmcount (vector memory operations), + // SIMM16[6:4] = export/mem-write-data count, + // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count). 
+ void + Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 vm_cnt = 0; + ScalarRegI32 exp_cnt = 0; + ScalarRegI32 lgkm_cnt = 0; + vm_cnt = bits(instData.SIMM16, 3, 0); + exp_cnt = bits(instData.SIMM16, 6, 4); + lgkm_cnt = bits(instData.SIMM16, 12, 8); + gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT); + gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt); + } // execute + // --- Inst_SOPP__S_SETHALT class methods --- + + Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sethalt") + { + } // Inst_SOPP__S_SETHALT + + Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT() + { + } // ~Inst_SOPP__S_SETHALT + + // --- description from .arch file --- + // Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume. + // The halt flag is ignored while PRIV == 1 (inside trap handlers) but the + // shader will halt immediately after the handler returns if HALT is still + // set at that time. + void + Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SLEEP class methods --- + + Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sleep") + { + setFlag(ALU); + setFlag(Sleep); + } // Inst_SOPP__S_SLEEP + + Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP() + { + } // ~Inst_SOPP__S_SLEEP + + // --- description from .arch file --- + // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks. + // The exact amount of delay is approximate. Compare with S_NOP. 
+ void + Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16; + gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP); + // sleep duration is specified in multiples of 64 cycles + gpuDynInst->wavefront()->setSleepTime(64 * simm16); + } // execute + // --- Inst_SOPP__S_SETPRIO class methods --- + + Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_setprio") + { + setFlag(ALU); + } // Inst_SOPP__S_SETPRIO + + Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO() + { + } // ~Inst_SOPP__S_SETPRIO + + // --- description from .arch file --- + // User settable wave priority is set to SIMM16[1:0]. 0 = lowest, + // 3 = highest. + // The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0], + // WaveAge[3:0]}. + void + Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegU16 simm16 = instData.SIMM16; + ScalarRegU32 userPrio = simm16 & 0x3; + + warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio); + } // execute + // --- Inst_SOPP__S_SENDMSG class methods --- + + Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sendmsg") + { + } // Inst_SOPP__S_SENDMSG + + Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG() + { + } // ~Inst_SOPP__S_SENDMSG + + // --- description from .arch file --- + // Send a message upstream to VGT or the interrupt handler. + // SIMM16[9:0] contains the message type and is documented in the shader + // --- programming guide. 
+ void + Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SENDMSGHALT class methods --- + + Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sendmsghalt") + { + } // Inst_SOPP__S_SENDMSGHALT + + Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT() + { + } // ~Inst_SOPP__S_SENDMSGHALT + + // --- description from .arch file --- + // Send a message and then HALT the wavefront; see S_SENDMSG for details. + void + Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_TRAP class methods --- + + Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_trap") + { + } // Inst_SOPP__S_TRAP + + Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP() + { + } // ~Inst_SOPP__S_TRAP + + // --- description from .arch file --- + // TrapID = SIMM16[7:0]; + // Wait for all instructions to complete; + // set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0], + // PC[47:0]}; + // PC = TBA (trap base address); + // PRIV = 1. + // Enter the trap handler. This instruction may be generated internally as + // well in response to a host trap (HT = 1) or an exception. + // TrapID 0 is reserved for hardware use and should not be used in a + // shader-generated trap. + void + Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_ICACHE_INV class methods --- + + Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_icache_inv") + { + } // Inst_SOPP__S_ICACHE_INV + + Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() + { + } // ~Inst_SOPP__S_ICACHE_INV + + // --- description from .arch file --- + // Invalidate entire L1 instruction cache. + // You must have 12 separate S_NOP instructions or a jump/branch + // instruction after this instruction + // to ensure the SQ instruction buffer is purged. 
+ void + Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_INCPERFLEVEL class methods --- + + Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_incperflevel") + { + } // Inst_SOPP__S_INCPERFLEVEL + + Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL() + { + } // ~Inst_SOPP__S_INCPERFLEVEL + + // --- description from .arch file --- + // Increment performance counter specified in SIMM16[3:0] by 1. + void + Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_DECPERFLEVEL class methods --- + + Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_decperflevel") + { + } // Inst_SOPP__S_DECPERFLEVEL + + Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL() + { + } // ~Inst_SOPP__S_DECPERFLEVEL + + // --- description from .arch file --- + // Decrement performance counter specified in SIMM16[3:0] by 1. + void + Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_TTRACEDATA class methods --- + + Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_ttracedata") + { + } // Inst_SOPP__S_TTRACEDATA + + Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA() + { + } // ~Inst_SOPP__S_TTRACEDATA + + // --- description from .arch file --- + // Send M0 as user data to the thread trace stream. 
+ void + Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods --- + + Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbgsys") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGSYS + + Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGSYS + + // --- description from .arch file --- + // if (conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4) + // + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods --- + + Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbguser") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGUSER + + Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGUSER + + // --- description from .arch file --- + // if (conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods --- + + Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER + + Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER:: + ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER + + // --- description from .arch file --- + // if (conditional_debug_system || conditional_debug_user) then PC = PC + + // --- signext(SIMM16 * 4) + 4; + // else NOP. 
+ void + Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods --- + + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user") + { + setFlag(Branch); + } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER + + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: + ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER() + { + } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER + + // --- description from .arch file --- + // if (conditional_debug_system && conditional_debug_user) then PC = PC + + // --- signext(SIMM16 * 4) + 4; + // else NOP. + void + Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_ENDPGM_SAVED class methods --- + + Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_endpgm_saved") + { + } // Inst_SOPP__S_ENDPGM_SAVED + + Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED() + { + } // ~Inst_SOPP__S_ENDPGM_SAVED + + // --- description from .arch file --- + // End of program; signal that a wave has been saved by the context-switch + // trap handler and terminate wavefront. + // The hardware implicitly executes S_WAITCNT 0 before executing this + // instruction. + // Use S_ENDPGM in all cases unless you are executing the context-switch + // save handler. + void + Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods --- + + Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_set_gpr_idx_off") + { + } // Inst_SOPP__S_SET_GPR_IDX_OFF + + Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF() + { + } // ~Inst_SOPP__S_SET_GPR_IDX_OFF + + // --- description from .arch file --- + // MODE.gpr_idx_en = 0. 
+ // Clear GPR indexing mode. Vector operations after this will not perform + // --- relative GPR addressing regardless of the contents of M0. This + // --- instruction does not modify M0. + void + Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods --- + + Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE( + InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_set_gpr_idx_mode") + { + } // Inst_SOPP__S_SET_GPR_IDX_MODE + + Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE() + { + } // ~Inst_SOPP__S_SET_GPR_IDX_MODE + + // --- description from .arch file --- + // M0[15:12] = SIMM4. + // Modify the mode used for vector GPR indexing. + // The raw contents of the source field are read and used to set the enable + // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL + // and SIMM4[3] = VDST_REL. + void + Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vinterp.cc b/src/arch/amdgpu/vega/insts/vinterp.cc new file mode 100644 index 0000000000..784f6f2eb2 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vinterp.cc @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VINTRP__V_INTERP_P1_F32 class methods --- + + Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32( + InFmt_VINTRP *iFmt) + : Inst_VINTRP(iFmt, "v_interp_p1_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VINTRP__V_INTERP_P1_F32 + + Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32() + { + } // ~Inst_VINTRP__V_INTERP_P1_F32 + + // --- description from .arch file --- + // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; + // if D == S then data corruption will occur. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. 
+ void + Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VINTRP__V_INTERP_P2_F32 class methods --- + + Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32( + InFmt_VINTRP *iFmt) + : Inst_VINTRP(iFmt, "v_interp_p2_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VINTRP__V_INTERP_P2_F32 + + Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32() + { + } // ~Inst_VINTRP__V_INTERP_P2_F32 + + // --- description from .arch file --- + // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VINTRP__V_INTERP_MOV_F32 class methods --- + + Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32( + InFmt_VINTRP *iFmt) + : Inst_VINTRP(iFmt, "v_interp_mov_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VINTRP__V_INTERP_MOV_F32 + + Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32() + { + } // ~Inst_VINTRP__V_INTERP_MOV_F32 + + // --- description from .arch file --- + // D.f = {P10,P20,P0}[S.u]; parameter load. + void + Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc new file mode 100644 index 0000000000..fc41c0ae78 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop1.cc @@ -0,0 +1,2340 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP1__V_NOP class methods --- + + Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_nop") + { + setFlag(Nop); + setFlag(ALU); + } // Inst_VOP1__V_NOP + + Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP() + { + } // ~Inst_VOP1__V_NOP + + // --- description from .arch file --- + // Do nothing. 
+    void
+    Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst)
+    {
+    } // execute
+    // --- Inst_VOP1__V_MOV_B32 class methods ---
+
+    Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_mov_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_MOV_B32
+
+    Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32()
+    {
+    } // ~Inst_VOP1__V_MOV_B32
+
+    // --- description from .arch file ---
+    // D.u = S0.u.
+    // Input and output modifiers not supported; this is an untyped operation.
+    void
+    Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        if (isDPPInst()) {
+            // DPP variant: the source register comes from the DPP extension
+            // dword rather than the base VOP1 encoding
+            VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
+            src_dpp.read();
+
+            // %#x already emits the "0x" prefix; a literal "0x%#x" would
+            // double it in the trace output (e.g. "0x0x10")
+            DPRINTF(VEGA, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], "
+                    "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, "
+                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
+                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
+                    extData.iFmt_VOP_DPP.DPP_CTRL,
+                    extData.iFmt_VOP_DPP.SRC0_ABS,
+                    extData.iFmt_VOP_DPP.SRC0_NEG,
+                    extData.iFmt_VOP_DPP.SRC1_ABS,
+                    extData.iFmt_VOP_DPP.SRC1_NEG,
+                    extData.iFmt_VOP_DPP.BC,
+                    extData.iFmt_VOP_DPP.BANK_MASK,
+                    extData.iFmt_VOP_DPP.ROW_MASK);
+
+            // NOTE: For VOP1, there is no SRC1, so make sure we're not trying
+            // to negate it or take the absolute value of it
+            assert(!extData.iFmt_VOP_DPP.SRC1_ABS);
+            assert(!extData.iFmt_VOP_DPP.SRC1_NEG);
+            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp);
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src_dpp[lane];
+                }
+            }
+        } else {
+            // plain move: copy the source into the destination for every
+            // lane enabled in the execution mask
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_READFIRSTLANE_B32 class methods ---
+
Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_readfirstlane_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_READFIRSTLANE_B32 + + Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32() + { + } // ~Inst_VOP1__V_READFIRSTLANE_B32 + + // --- description from .arch file --- + // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data + // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec) + // (Lane# = 0 if exec is zero). Ignores exec mask for the access. SQ + // translates to V_READLANE_B32. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarRegI32 src_lane(0); + ScalarRegU64 exec_mask = wf->execMask().to_ullong(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (exec_mask) { + src_lane = findLsbSet(exec_mask); + } + + sdst = src[src_lane]; + + sdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_I32_F64 class methods --- + + Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_i32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_I32_F64 + + Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64() + { + } // ~Inst_VOP1__V_CVT_I32_F64 + + // --- description from .arch file --- + // D.i = (int)S0.d. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F64_I32 class methods --- + + Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f64_i32") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F64_I32 + + Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32() + { + } // ~Inst_VOP1__V_CVT_F64_I32 + + // --- description from .arch file --- + // D.d = (double)S0.i. + void + Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_I32 class methods --- + + Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_i32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_I32 + + Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32() + { + } // ~Inst_VOP1__V_CVT_F32_I32 + + // --- description from .arch file --- + // D.f = (float)S0.i. 
+ void + Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_U32 class methods --- + + Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_u32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_U32 + + Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32() + { + } // ~Inst_VOP1__V_CVT_F32_U32 + + // --- description from .arch file --- + // D.f = (float)S0.u. + void + Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_U32_F32 class methods --- + + Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_u32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_U32_F32 + + Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32() + { + } // ~Inst_VOP1__V_CVT_U32_F32 + + // --- description from .arch file --- + // D.u = (unsigned)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_I32_F32 class methods --- + + Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_I32_F32 + + Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32() + { + } // ~Inst_VOP1__V_CVT_I32_F32 + + // --- description from .arch file --- + // D.i = (int)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+    // Convert each lane's f32 source to a signed 32-bit integer with
+    // saturation: NaN -> 0, +/-inf and out-of-range magnitudes -> INT_MAX
+    // or INT_MIN depending on sign.
+    void
+    Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // frexp yields e with |x| in [2^(e-1), 2^e); exp > 30
+                // therefore flags |x| >= 2^30 as out of range.
+                // NOTE(review): that also saturates finite values in
+                // [2^30, 2^31) that DO fit in int32 -- overly conservative;
+                // confirm whether this matches hardware before changing
+                // (this commit is a pure code move, no functional changes).
+                int exp;
+                std::frexp(src[lane],&exp);
+                if (std::isnan(src[lane])) {
+                    vdst[lane] = 0;
+                } else if (std::isinf(src[lane]) || exp > 30) {
+                    if (std::signbit(src[lane])) {
+                        vdst[lane] = INT_MIN;
+                    } else {
+                        vdst[lane] = INT_MAX;
+                    }
+                } else {
+                    vdst[lane] = (VecElemI32)src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_MOV_FED_B32 class methods ---
+
+    Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_mov_fed_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_MOV_FED_B32
+
+    Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32()
+    {
+    } // ~Inst_VOP1__V_MOV_FED_B32
+
+    // --- description from .arch file ---
+    // D.u = S0.u;
+    // Introduce EDC double error upon write to dest vgpr without causing an
+    // --- exception.
+    // Input and output modifiers not supported; this is an untyped operation.
+    void
+    Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // ECC error injection is not modeled by the simulator
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F16_F32 class methods ---
+
+    Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f16_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F16_F32
+
+    Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32()
+    {
+    } // ~Inst_VOP1__V_CVT_F16_F32
+
+    // --- description from .arch file ---
+    // D.f16 = flt32_to_flt16(S0.f).
+    // Supports input modifiers and creates FP16 denormals when appropriate.
+ void + Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F32_F16 class methods --- + + Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_f16") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_F16 + + Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16() + { + } // ~Inst_VOP1__V_CVT_F32_F16 + + // --- description from .arch file --- + // D.f = flt16_to_flt32(S0.f16). + // FP16 denormal inputs are always accepted. + void + Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods --- + + Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_RPI_I32_F32 + + Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32() + { + } // ~Inst_VOP1__V_CVT_RPI_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f + 0.5). + void + Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_FLR_I32_F32 class methods --- + + Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_FLR_I32_F32 + + Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32() + { + } // ~Inst_VOP1__V_CVT_FLR_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f). 
+ void + Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_OFF_F32_I4 class methods --- + + Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_off_f32_i4") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_OFF_F32_I4 + + Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4() + { + } // ~Inst_VOP1__V_CVT_OFF_F32_I4 + + // --- description from .arch file --- + // 4-bit signed int to 32-bit float. Used for interpolation in shader. + void + Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) + { + // Could not parse sq_uc.arch desc field + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F32_F64 class methods --- + + Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F32_F64 + + Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64() + { + } // ~Inst_VOP1__V_CVT_F32_F64 + + // --- description from .arch file --- + // D.f = (float)S0.d. 
+ void + Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F64_F32 class methods --- + + Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f64_f32") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F64_F32 + + Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32() + { + } // ~Inst_VOP1__V_CVT_F64_F32 + + // --- description from .arch file --- + // D.d = (double)S0.f. + void + Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE0 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE0 + + Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE0 + + // --- description from .arch file --- + // D.f = (float)(S0.u[7:0]). 
+ void + Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE1 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE1 + + Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE1 + + // --- description from .arch file --- + // D.f = (float)(S0.u[15:8]). + void + Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE2 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE2 + + Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE2 + + // --- description from .arch file --- + // D.f = (float)(S0.u[23:16]). 
+ void + Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F32_UBYTE3 class methods --- + + Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CVT_F32_UBYTE3 + + Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3() + { + } // ~Inst_VOP1__V_CVT_F32_UBYTE3 + + // --- description from .arch file --- + // D.f = (float)(S0.u[31:24]). + void + Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_U32_F64 class methods --- + + Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_u32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_U32_F64 + + Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64() + { + } // ~Inst_VOP1__V_CVT_U32_F64 + + // --- description from .arch file --- + // D.u = (unsigned)S0.d. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CVT_F64_U32 class methods --- + + Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f64_u32") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CVT_F64_U32 + + Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32() + { + } // ~Inst_VOP1__V_CVT_F64_U32 + + // --- description from .arch file --- + // D.d = (double)S0.u. + void + Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_TRUNC_F64 class methods --- + + Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_trunc_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_TRUNC_F64 + + Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64() + { + } // ~Inst_VOP1__V_TRUNC_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d), return integer part of S0.d. 
+ void + Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CEIL_F64 class methods --- + + Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ceil_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_CEIL_F64 + + Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64() + { + } // ~Inst_VOP1__V_CEIL_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. + void + Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RNDNE_F64 class methods --- + + Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rndne_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_RNDNE_F64 + + Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64() + { + } // ~Inst_VOP1__V_RNDNE_F64 + + // --- description from .arch file --- + // D.d = round_nearest_even(S0.d). 
+ void + Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FLOOR_F64 class methods --- + + Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_floor_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FLOOR_F64 + + Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64() + { + } // ~Inst_VOP1__V_FLOOR_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. + void + Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FRACT_F32 class methods --- + + Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_fract_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FRACT_F32 + + Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32() + { + } // ~Inst_VOP1__V_FRACT_F32 + + // --- description from .arch file --- + // D.f = S0.f - floor(S0.f). 
+ void + Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_TRUNC_F32 class methods --- + + Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_trunc_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_TRUNC_F32 + + Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32() + { + } // ~Inst_VOP1__V_TRUNC_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f), return integer part of S0.f. + void + Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst (gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CEIL_F32 class methods --- + + Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ceil_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_CEIL_F32 + + Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32() + { + } // ~Inst_VOP1__V_CEIL_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. 
+ void + Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RNDNE_F32 class methods --- + + Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rndne_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RNDNE_F32 + + Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32() + { + } // ~Inst_VOP1__V_RNDNE_F32 + + // --- description from .arch file --- + // D.f = round_nearest_even(S0.f). + void + Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FLOOR_F32 class methods --- + + Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_floor_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FLOOR_F32 + + Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32() + { + } // ~Inst_VOP1__V_FLOOR_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. 
+ void + Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_EXP_F32 class methods --- + + Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_exp_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_EXP_F32 + + Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32() + { + } // ~Inst_VOP1__V_EXP_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f). + void + Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::pow(2.0, src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_LOG_F32 class methods --- + + Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_log_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_LOG_F32 + + Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32() + { + } // ~Inst_VOP1__V_LOG_F32 + + // --- description from .arch file --- + // D.f = log2(S0.f). Base 2 logarithm. 
+ void + Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::log2(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RCP_F32 class methods --- + + Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RCP_F32 + + Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32() + { + } // ~Inst_VOP1__V_RCP_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error. + void + Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RCP_IFLAG_F32 class methods --- + + Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_iflag_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RCP_IFLAG_F32 + + Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32() + { + } // ~Inst_VOP1__V_RCP_IFLAG_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise + // --- integer DIV_BY_ZERO exception but cannot raise floating-point + // --- exceptions. 
+ void + Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RSQ_F32 class methods --- + + Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rsq_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_RSQ_F32 + + Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32() + { + } // ~Inst_VOP1__V_RSQ_F32 + + // --- description from .arch file --- + // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules. + void + Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_RCP_F64 class methods --- + + Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_RCP_F64 + + Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64() + { + } // ~Inst_VOP1__V_RCP_F64 + + // --- description from .arch file --- + // D.d = 1.0 / S0.d. 
+    void
+    Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::fpclassify(src[lane]) == FP_ZERO) {
+                    // IEEE-754: 1.0 / +-0.0 = +-inf (sign preserved).
+                    vdst[lane] = std::signbit(src[lane]) ? -INFINITY
+                                                         : +INFINITY;
+                } else if (std::isnan(src[lane])) {
+                    vdst[lane] = NAN;
+                } else if (std::isinf(src[lane])) {
+                    if (std::signbit(src[lane])) {
+                        vdst[lane] = -0.0;
+                    } else {
+                        vdst[lane] = 0.0;
+                    }
+                } else {
+                    vdst[lane] = 1.0 / src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RSQ_F64 class methods ---
+
+    Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rsq_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_RSQ_F64
+
+    Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64()
+    {
+    } // ~Inst_VOP1__V_RSQ_F64
+
+    // --- description from .arch file ---
+    // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32.
+ void + Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane]) + && !std::signbit(src[lane])) { + vdst[lane] = 0.0; + } else if (std::signbit(src[lane])) { + vdst[lane] = NAN; + } else { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_SQRT_F32 class methods --- + + Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sqrt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_SQRT_F32 + + Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32() + { + } // ~Inst_VOP1__V_SQRT_F32 + + // --- description from .arch file --- + // D.f = sqrt(S0.f). + void + Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_SQRT_F64 class methods --- + + Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sqrt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_SQRT_F64 + + Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64() + { + } // ~Inst_VOP1__V_SQRT_F64 + + // --- description from .arch file --- + // D.d = sqrt(S0.d). 
+ void + Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_SIN_F32 class methods --- + + Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sin_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_SIN_F32 + + Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32() + { + } // ~Inst_VOP1__V_SIN_F32 + + // --- description from .arch file --- + // D.f = sin(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 0.0. + void + Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + ConstScalarOperandF32 pi(gpuDynInst, REG_PI); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + pi.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (src[lane] < -256.0 || src[lane] > 256.0) { + vdst[lane] = 0.0; + } else { + vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData()); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_COS_F32 class methods --- + + Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cos_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_COS_F32 + + Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32() + { + } // ~Inst_VOP1__V_COS_F32 + + // --- description from .arch file --- + // D.f = cos(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 1.0. 
+    void
+    Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+        pi.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (src[lane] < -256.0 || src[lane] > 256.0) {
+                    vdst[lane] = 1.0; // per spec: out-of-range input -> 1.0
+                } else {
+                    vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData());
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_NOT_B32 class methods ---
+
+    Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_not_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_NOT_B32
+
+    Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32()
+    {
+    } // ~Inst_VOP1__V_NOT_B32
+
+    // --- description from .arch file ---
+    // D.u = ~S0.u.
+    // Input and output modifiers not supported.
+    void
+    Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = ~src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_BFREV_B32 class methods ---
+
+    Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_bfrev_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_BFREV_B32
+
+    Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32()
+    {
+    } // ~Inst_VOP1__V_BFREV_B32
+
+    // --- description from .arch file ---
+    // D.u[31:0] = S0.u[0:31], bitfield reverse.
+    // Input and output modifiers not supported.
+ void + Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = reverseBits(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FFBH_U32 class methods --- + + Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ffbh_u32") + { + setFlag(ALU); + } // Inst_VOP1__V_FFBH_U32 + + Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32() + { + } // ~Inst_VOP1__V_FFBH_U32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from MSB; + // D.u = 0xffffffff if S0.u == 0. + void + Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOneMsb(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FFBL_B32 class methods --- + + Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ffbl_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_FFBL_B32 + + Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32() + { + } // ~Inst_VOP1__V_FFBL_B32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from LSB; + // D.u = 0xffffffff if S0.u == 0. 
+ void + Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOne(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FFBH_I32 class methods --- + + Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ffbh_i32") + { + setFlag(ALU); + } // Inst_VOP1__V_FFBH_I32 + + Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32() + { + } // ~Inst_VOP1__V_FFBH_I32 + + // --- description from .arch file --- + // D.u = position of first bit different from sign bit in S0.i from MSB; + // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. + void + Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = firstOppositeSignBit(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_EXP_I32_F64 class methods --- + + Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FREXP_EXP_I32_F64 + + Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64() + { + } // ~Inst_VOP1__V_FREXP_EXP_I32_F64 + + // --- description from .arch file --- + // See V_FREXP_EXP_I32_F32. 
+ void + Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp = 0; + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_MANT_F64 class methods --- + + Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_mant_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FREXP_MANT_F64 + + Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64() + { + } // ~Inst_VOP1__V_FREXP_MANT_F64 + + // --- description from .arch file --- + // See V_FREXP_MANT_F32. + void + Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = src[lane]; + } else { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FRACT_F64 class methods --- + + Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_fract_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP1__V_FRACT_F64 + + Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64() + { + } // ~Inst_VOP1__V_FRACT_F64 + + // --- description from .arch file --- + // See V_FRACT_F32. 
+ void + Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, instData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF64 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_EXP_I32_F32 class methods --- + + Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FREXP_EXP_I32_F32 + + Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32() + { + } // ~Inst_VOP1__V_FREXP_EXP_I32_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.i = 0; + // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). + // Returns exponent of single precision float input, such that S0.f = + // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns + // the significand. 
+ void + Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_FREXP_MANT_F32 class methods --- + + Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_mant_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_FREXP_MANT_F32 + + Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32() + { + } // ~Inst_VOP1__V_FREXP_MANT_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; + // else D.f = Mantissa(S0.f). + // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary + // --- significand of single precision float input, such that S0.f = + // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which + // --- returns integer exponent. 
+ void + Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = src[lane]; + } else { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP1__V_CLREXCP class methods --- + + Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_clrexcp") + { + setFlag(ALU); + } // Inst_VOP1__V_CLREXCP + + Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP() + { + } // ~Inst_VOP1__V_CLREXCP + + // --- description from .arch file --- + // Clear wave's exception state in SIMD (SP). + void + Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F16_U16 class methods --- + + Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f16_u16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_F16_U16 + + Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16() + { + } // ~Inst_VOP1__V_CVT_F16_U16 + + // --- description from .arch file --- + // D.f16 = uint16_to_flt16(S.u16). + // Supports denormals, rounding, exception flags and saturation. + void + Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_F16_I16 class methods --- + + Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_f16_i16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_F16_I16 + + Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16() + { + } // ~Inst_VOP1__V_CVT_F16_I16 + + // --- description from .arch file --- + // D.f16 = int16_to_flt16(S.i16). 
+ // Supports denormals, rounding, exception flags and saturation. + void + Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_U16_F16 class methods --- + + Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_u16_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_U16_F16 + + Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16() + { + } // ~Inst_VOP1__V_CVT_U16_F16 + + // --- description from .arch file --- + // D.u16 = flt16_to_uint16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CVT_I16_F16 class methods --- + + Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cvt_i16_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CVT_I16_F16 + + Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16() + { + } // ~Inst_VOP1__V_CVT_I16_F16 + + // --- description from .arch file --- + // D.i16 = flt16_to_int16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_RCP_F16 class methods --- + + Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rcp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_RCP_F16 + + Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16() + { + } // ~Inst_VOP1__V_RCP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecip(S0.f16). 
+ void + Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_SQRT_F16 class methods --- + + Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sqrt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_SQRT_F16 + + Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16() + { + } // ~Inst_VOP1__V_SQRT_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateSqrt(S0.f16). + void + Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_RSQ_F16 class methods --- + + Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rsq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_RSQ_F16 + + Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16() + { + } // ~Inst_VOP1__V_RSQ_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecipSqrt(S0.f16). + void + Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_LOG_F16 class methods --- + + Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_log_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_LOG_F16 + + Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16() + { + } // ~Inst_VOP1__V_LOG_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 0.0f; + // else + // D.f16 = ApproximateLog2(S0.f16). 
+ void + Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_EXP_F16 class methods --- + + Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_exp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_EXP_F16 + + Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16() + { + } // ~Inst_VOP1__V_EXP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 0.0f) + // D.f16 = 1.0f; + // else + // D.f16 = Approximate2ToX(S0.f16). + void + Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FREXP_MANT_F16 class methods --- + + Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_mant_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FREXP_MANT_F16 + + Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16() + { + } // ~Inst_VOP1__V_FREXP_MANT_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.f16 = S0.f16; + // else + // D.f16 = mantissa(S0.f16). + // Result range is (-1.0,-0.5][0.5,1.0). + // C math library frexp function. + // Returns binary significand of half precision float input, such that the + // original single float = significand * (2 ** exponent). 
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_FREXP_EXP_I16_F16 class methods ---

    Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16(
          InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_FREXP_EXP_I16_F16

    Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16()
    {
    } // ~Inst_VOP1__V_FREXP_EXP_I16_F16

    // --- description from .arch file ---
    // if (S0.f16 == +-INF || S0.f16 == NAN)
    //     D.i16 = 0;
    // else
    //     D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1).
    // C math library frexp function.
    // Returns exponent of half precision float input, such that the
    // original single float = significand * (2 ** exponent).
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_FLOOR_F16 class methods ---

    Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_floor_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_FLOOR_F16

    Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16()
    {
    } // ~Inst_VOP1__V_FLOOR_F16

    // --- description from .arch file ---
    // D.f16 = trunc(S0.f16);
    // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f.
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_CEIL_F16 class methods ---

    Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ceil_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_CEIL_F16

    Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16()
    {
    } // ~Inst_VOP1__V_CEIL_F16

    // --- description from .arch file ---
    // D.f16 = trunc(S0.f16);
    // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f.
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_TRUNC_F16 class methods ---

    Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_trunc_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_TRUNC_F16

    Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16()
    {
    } // ~Inst_VOP1__V_TRUNC_F16

    // --- description from .arch file ---
    // D.f16 = trunc(S0.f16).
    // Round-to-zero semantics.
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_RNDNE_F16 class methods ---

    Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rndne_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_RNDNE_F16

    Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16()
    {
    } // ~Inst_VOP1__V_RNDNE_F16

    // --- description from .arch file ---
    // D.f16 = FLOOR(S0.f16 + 0.5f);
    // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f.
    // Round-to-nearest-even semantics.
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_FRACT_F16 class methods ---

    Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_fract_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_FRACT_F16

    Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16()
    {
    } // ~Inst_VOP1__V_FRACT_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 + -floor(S0.f16).
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_SIN_F16 class methods ---

    Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_sin_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_SIN_F16

    Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16()
    {
    } // ~Inst_VOP1__V_SIN_F16

    // --- description from .arch file ---
    // D.f16 = sin(S0.f16 * 2 * PI).
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_COS_F16 class methods ---

    Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cos_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_COS_F16

    Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16()
    {
    } // ~Inst_VOP1__V_COS_F16

    // --- description from .arch file ---
    // D.f16 = cos(S0.f16 * 2 * PI).
    // Not implemented in the model (see panicUnimplemented() below).
    void
    Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_EXP_LEGACY_F32 class methods ---

    Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_exp_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_EXP_LEGACY_F32

    Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32()
    {
    } // ~Inst_VOP1__V_EXP_LEGACY_F32

    // --- description from .arch file ---
    // D.f = pow(2.0, S0.f) with legacy semantics.
    // Modeled as plain pow(2, x); any "legacy semantics" special cases are
    // not handled separately here.
    void
    Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // SRC0 may encode a non-VGPR source, hence readSrc() below.
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Per-lane 2^x for all active lanes in the wavefront.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::pow(2.0, src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_LOG_LEGACY_F32 class methods ---

    Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_log_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_LOG_LEGACY_F32

    Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32()
    {
    } // ~Inst_VOP1__V_LOG_LEGACY_F32

    // --- description from .arch file ---
    // D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
    // Modeled as plain log2(x); "legacy semantics" are not handled
    // separately here.
    void
    Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::log2(src[lane]);
            }
        }

        vdst.write();
    } // execute
} // namespace VegaISA
} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/vop2.cc b/src/arch/amdgpu/vega/insts/vop2.cc
new file mode 100644
index 0000000000..ddd77e27da
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vop2.cc
@@ -0,0 +1,2187 @@
/*
 * Copyright (c) 2024 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "arch/amdgpu/vega/insts/inst_util.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "debug/VEGA.hh"

namespace gem5
{

namespace VegaISA
{
    // --- Inst_VOP2__V_CNDMASK_B32 class methods ---

    Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_cndmask_b32")
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_CNDMASK_B32

    Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32()
    {
    } // ~Inst_VOP2__V_CNDMASK_B32

    // --- description from .arch file ---
    // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
    // as a scalar GPR in S2.
    void
    Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Per-lane select: VCC bit set picks src1, clear picks src0.
                vdst[lane]
                    = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_F32 class methods ---

    Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_ADD_F32

    Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32()
    {
    } // ~Inst_VOP2__V_ADD_F32

    // --- description from .arch file ---
    // D.f = S0.f + S1.f.
    void
    Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        // DPP-encoded instances rewrite src0 through the DPP lane-shuffle
        // logic before the add; otherwise this is a plain lane-wise add.
        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] + src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_F32 class methods ---

    Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUB_F32

    Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32()
    {
    } // ~Inst_VOP2__V_SUB_F32

    // --- description from .arch file ---
    // D.f = S0.f - S1.f.
    // SQ translates to V_ADD_F32.
+ void + Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_F32 class methods --- + + Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_SUBREV_F32 + + Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32() + { + } // ~Inst_VOP2__V_SUBREV_F32 + + // --- description from .arch file --- + // D.f = S1.f - S0.f. + // SQ translates to V_ADD_F32. + void + Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods --- + + Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_legacy_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MUL_LEGACY_F32 + + Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32() + { + } // ~Inst_VOP2__V_MUL_LEGACY_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
    void
    Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        // NOTE(review): this is a plain IEEE multiply. It does NOT apply the
        // DX9 legacy rule (0.0 * x == 0.0 for any x, including inf/NaN)
        // stated in the description above -- confirm intended fidelity.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_F32 class methods ---

    Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_F32

    Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32()
    {
    } // ~Inst_VOP2__V_MUL_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f.
    void
    Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        // The branches below spell out IEEE-754 multiply edge cases, with
        // subnormal operands classified together with zero (flush-to-zero
        // behavior for src0).
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane])) {
                    // NaN propagates.
                    vdst[lane] = NAN;
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           !std::signbit(src0[lane])) {
                    // src0 is +0 (or +subnormal): 0 * inf = NaN, otherwise a
                    // signed zero taking src1's sign.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           std::signbit(src0[lane])) {
                    // src0 is -0 (or -subnormal): sign of the zero flips
                    // with src1's sign.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if (std::isinf(src0[lane]) &&
                           !std::signbit(src0[lane])) {
                    // src0 is +inf: inf * 0 = NaN, otherwise a signed
                    // infinity taking src1's sign.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else if (std::isinf(src0[lane]) &&
                           std::signbit(src0[lane])) {
                    // src0 is -inf: sign of the infinity flips with src1.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else {
                    // Finite, non-zero src0: ordinary multiply.
                    vdst[lane] = src0[lane] * src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_I32_I24 class methods ---

    Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_I32_I24

    Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_I32_I24

    // --- description from .arch file ---
    // D.i = S0.i[23:0] * S1.i[23:0].
    void
    Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Sign-extend the low 24 bits of each operand, then multiply.
                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
                    * sext<24>(bits(src1[lane], 23, 0));
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods ---

    Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_I32_I24

    Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_HI_I32_I24

    // --- description from .arch file ---
    // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
+ void + Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 tmp_src0 + = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); + VecElemI64 tmp_src1 + = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); + + vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_U32_U24 class methods --- + + Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_u32_u24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_U32_U24 + + Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24() + { + } // ~Inst_VOP2__V_MUL_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0]. + void + Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1, + VecOperandU32& vdst, Wavefront* wf) { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * + bits(src1[lane], 23, 0); + } + } + }; + + vop2Helper(gpuDynInst, opImpl); + } // execute + // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods --- + + Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_hi_u32_u24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_HI_U32_U24 + + Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24() + { + } // ~Inst_VOP2__V_MUL_HI_U32_U24 + + // --- description from .arch file --- + // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
+ void + Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); + VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); + vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_F32 class methods --- + + Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MIN_F32 + + Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() + { + } // ~Inst_VOP2__V_MIN_F32 + + // --- description from .arch file --- + // D.f = (S0.f < S1.f ? S0.f : S1.f). + void + Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_F32 class methods --- + + Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MAX_F32 + + Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() + { + } // ~Inst_VOP2__V_MAX_F32 + + // --- description from .arch file --- + // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
+ void + Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_I32 class methods --- + + Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_I32 + + Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() + { + } // ~Inst_VOP2__V_MIN_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i). + void + Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_I32 class methods --- + + Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_I32 + + Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() + { + } // ~Inst_VOP2__V_MAX_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i). 
+ void + Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_U32 class methods --- + + Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_U32 + + Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() + { + } // ~Inst_VOP2__V_MIN_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u). + void + Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_U32 class methods --- + + Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_U32 + + Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() + { + } // ~Inst_VOP2__V_MAX_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u). 
+ void + Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHRREV_B32 class methods --- + + Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshrrev_b32") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHRREV_B32 + + Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() + { + } // ~Inst_VOP2__V_LSHRREV_B32 + + // --- description from .arch file --- + // D.u = S1.u >> S0.u[4:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ASHRREV_I32 class methods --- + + Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ashrrev_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_ASHRREV_I32 + + Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() + { + } // ~Inst_VOP2__V_ASHRREV_I32 + + // --- description from .arch file --- + // D.i = signext(S1.i) >> S0.i[4:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
    void
    Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // src1 is signed, so >> here is an arithmetic shift
                // (sign-filling) on the compilers gem5 targets.
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHLREV_B32 class methods ---

    Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B32

    Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32()
    {
    } // ~Inst_VOP2__V_LSHLREV_B32

    // --- description from .arch file ---
    // D.u = S1.u << S0.u[4:0].
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        // SDWA-encoded instances run src0 through the sub-dword select
        // logic before the shift, and the destination through the SDWA
        // destination select afterwards.
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and vdst during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
                    "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
                    "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_AND_B32 class methods ---

    Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_and_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_AND_B32

    Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32()
    {
    } // ~Inst_VOP2__V_AND_B32

    // --- description from .arch file ---
    // D.u = S0.u & S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        // DPP-encoded instances rewrite src0 through the DPP lane-shuffle
        // logic before the AND.
        if (isDPPInst()) {
            VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] & src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] & src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_OR_B32 class methods ---

    Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_or_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_OR_B32

    Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32()
    {
    } // ~Inst_VOP2__V_OR_B32

    // --- description from .arch file ---
    // D.u = S0.u | S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        // SDWA handling mirrors V_LSHLREV_B32 above.
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] | src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] | src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_XOR_B32 class methods ---

    Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_xor_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_XOR_B32

    Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32()
    {
    } // ~Inst_VOP2__V_XOR_B32

    // --- description from .arch file ---
    // D.u = S0.u ^ S1.u.
    // Input and output modifiers not supported.
+ void + Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] ^ src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAC_F32 class methods --- + + Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mac_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(MAC); + } // Inst_VOP2__V_MAC_F32 + + Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32() + { + } // ~Inst_VOP2__V_MAC_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + D.f. + // SQ translates to V_MAD_F32. + void + Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + VecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + vdst.read(); + + if (isDPPInst()) { + VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src0_dpp.read(); + + DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. 
SRC0: register v[%d], " + "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " + "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " + "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, + extData.iFmt_VOP_DPP.DPP_CTRL, + extData.iFmt_VOP_DPP.SRC0_ABS, + extData.iFmt_VOP_DPP.SRC0_NEG, + extData.iFmt_VOP_DPP.SRC1_ABS, + extData.iFmt_VOP_DPP.SRC1_NEG, + extData.iFmt_VOP_DPP.BC, + extData.iFmt_VOP_DPP.BANK_MASK, + extData.iFmt_VOP_DPP.ROW_MASK); + + processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0_dpp[lane], src1[lane], + vdst[lane]); + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MADMK_F32 class methods --- + + Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madmk_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(MAD); + } // Inst_VOP2__V_MADMK_F32 + + Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32() + { + } // ~Inst_VOP2__V_MADMK_F32 + + // --- description from .arch file --- + // D.f = S0.f * K + S1.f; K is a 32-bit inline constant. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // --- modifiers. + // SQ translates to V_MAD_F32. 
+ void + Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + VecElemF32 k = extData.imm_f32; + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], k, src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MADAK_F32 class methods --- + + Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madak_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(MAD); + } // Inst_VOP2__V_MADAK_F32 + + Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() + { + } // ~Inst_VOP2__V_MADAK_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // --- modifiers. + // SQ translates to V_MAD_F32. 
+ void + Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + VecElemF32 k = extData.imm_f32; + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], k); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ADD_CO_U32 class methods --- + + Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP2__V_ADD_CO_U32 + + Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32() + { + } // ~Inst_VOP2__V_ADD_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED + // --- overflow or carry-out for V_ADDC_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + void + Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + if (isSDWAInst()) { + VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); + // use copies of original src0, src1, and dest during selecting + VecOperandU32 origSrc0_sdwa(gpuDynInst, + extData.iFmt_VOP_SDWA.SRC0); + VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); + + DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. 
SRC0: register " + "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " + "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " + "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", + extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, + extData.iFmt_VOP_SDWA.DST_U, + extData.iFmt_VOP_SDWA.CLMP, + extData.iFmt_VOP_SDWA.SRC0_SEL, + extData.iFmt_VOP_SDWA.SRC0_SEXT, + extData.iFmt_VOP_SDWA.SRC0_NEG, + extData.iFmt_VOP_SDWA.SRC0_ABS, + extData.iFmt_VOP_SDWA.SRC1_SEL, + extData.iFmt_VOP_SDWA.SRC1_SEXT, + extData.iFmt_VOP_SDWA.SRC1_NEG, + extData.iFmt_VOP_SDWA.SRC1_ABS); + + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] + src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] + + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); + } + } + + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + vcc.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); + } + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_CO_U32 class methods --- + + Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP2__V_SUB_CO_U32 + + Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32() + { + } // ~Inst_VOP2__V_SUB_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or + // carry-out for V_SUBB_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
+ void + Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_CO_U32 class methods --- + + Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + } // Inst_VOP2__V_SUBREV_CO_U32 + + Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32() + { + } // ~Inst_VOP2__V_SUBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u; + // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or + // carry-out for V_SUBB_U32. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair. + void + Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + vcc.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_ADDC_CO_U32 class methods --- + + Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_addc_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP2__V_ADDC_CO_U32 + + Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32() + { + } // ~Inst_VOP2__V_ADDC_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + VCC[threadId]; + // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) + // is an UNSIGNED overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. + void + Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane] + + bits(vcc.rawData(), lane); + vcc.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane] + + (VecElemU64)bits(vcc.rawData(), lane, lane)) + >= 0x100000000 ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_SUBB_CO_U32 class methods --- + + Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subb_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP2__V_SUBB_CO_U32 + + Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32() + { + } // ~Inst_VOP2__V_SUBB_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // --- overflow. 
+ // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // --- source comes from the SGPR-pair at S2.u. + void + Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); + vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods --- + + Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subbrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP2__V_SUBBREV_CO_U32 + + Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32() + { + } // ~Inst_VOP2__V_SUBBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. + // SQ translates this to V_SUBREV_U32 with reversed operands. 
+ void + Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); + vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) + > src1[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_ADD_F16 class methods --- + + Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_ADD_F16 + + Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() + { + } // ~Inst_VOP2__V_ADD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_SUB_F16 class methods --- + + Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_SUB_F16 + + Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() + { + } // ~Inst_VOP2__V_SUB_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 - S1.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. 
+ void + Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_SUBREV_F16 class methods --- + + Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_SUBREV_F16 + + Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() + { + } // ~Inst_VOP2__V_SUBREV_F16 + + // --- description from .arch file --- + // D.f16 = S1.f16 - S0.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. + void + Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MUL_F16 class methods --- + + Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MUL_F16 + + Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() + { + } // ~Inst_VOP2__V_MUL_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MAC_F16 class methods --- + + Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mac_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAC); + } // Inst_VOP2__V_MAC_F16 + + Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() + { + } // ~Inst_VOP2__V_MAC_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + D.f16. + // Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. 
+ void + Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MADMK_F16 class methods --- + + Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madmk_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP2__V_MADMK_F16 + + Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() + { + } // ~Inst_VOP2__V_MADMK_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored + // in the following literal DWORD. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // modifiers. Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. + void + Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MADAK_F16 class methods --- + + Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madak_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP2__V_MADAK_F16 + + Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() + { + } // ~Inst_VOP2__V_MADAK_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored + // in the following literal DWORD. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // modifiers. Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. + void + Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_ADD_U16 class methods --- + + Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_ADD_U16 + + Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() + { + } // ~Inst_VOP2__V_ADD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 + S1.u16. 
+ // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_U16 class methods --- + + Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_SUB_U16 + + Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() + { + } // ~Inst_VOP2__V_SUB_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 - S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_U16 class methods --- + + Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_SUBREV_U16 + + Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() + { + } // ~Inst_VOP2__V_SUBREV_U16 + + // --- description from .arch file --- + // D.u16 = S1.u16 - S0.u16. + // Supports saturation (unsigned 16-bit integer domain). + // SQ translates this to V_SUB_U16 with reversed operands. 
+ void + Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_LO_U16 class methods --- + + Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_lo_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_LO_U16 + + Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() + { + } // ~Inst_VOP2__V_MUL_LO_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHLREV_B16 class methods --- + + Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshlrev_b16") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHLREV_B16 + + Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() + { + } // ~Inst_VOP2__V_LSHLREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHRREV_B16 class methods --- + + Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshrrev_b16") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHRREV_B16 + + Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() + { + } // ~Inst_VOP2__V_LSHRREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ASHRREV_I16 class methods --- + + Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ashrrev_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_ASHRREV_I16 + + Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() + { + } // ~Inst_VOP2__V_ASHRREV_I16 + + // --- description from .arch file --- + // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_F16 class methods --- + + Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MAX_F16 + + Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() + { + } // ~Inst_VOP2__V_MAX_F16 + + // --- description from .arch file --- + // D.f16 = max(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MIN_F16 class methods --- + + Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MIN_F16 + + Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() + { + } // ~Inst_VOP2__V_MIN_F16 + + // --- description from .arch file --- + // D.f16 = min(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MAX_U16 class methods --- + + Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_U16 + + Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() + { + } // ~Inst_VOP2__V_MAX_U16 + + // --- description from .arch file --- + // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). 
+ void + Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_I16 class methods --- + + Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_I16 + + Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() + { + } // ~Inst_VOP2__V_MAX_I16 + + // --- description from .arch file --- + // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). + void + Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_U16 class methods --- + + Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_U16 + + Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() + { + } // ~Inst_VOP2__V_MIN_U16 + + // --- description from .arch file --- + // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). 
+ void + Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_I16 class methods --- + + Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_I16 + + Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() + { + } // ~Inst_VOP2__V_MIN_I16 + + // --- description from .arch file --- + // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). + void + Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LDEXP_F16 class methods --- + + Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ldexp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_LDEXP_F16 + + Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() + { + } // ~Inst_VOP2__V_LDEXP_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * (2 ** S1.i16). 
+ void + Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_ADD_U32 class methods --- + + Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_ADD_U32 + + Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() + { + } // ~Inst_VOP2__V_ADD_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + void + Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + if (isSDWAInst()) { + VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); + // use copies of original src0, src1, and dest during selecting + VecOperandU32 origSrc0_sdwa(gpuDynInst, + extData.iFmt_VOP_SDWA.SRC0); + VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); + + DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. 
SRC0: register v[%d], " + "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " + "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " + "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", + extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, + extData.iFmt_VOP_SDWA.DST_U, + extData.iFmt_VOP_SDWA.CLMP, + extData.iFmt_VOP_SDWA.SRC0_SEL, + extData.iFmt_VOP_SDWA.SRC0_SEXT, + extData.iFmt_VOP_SDWA.SRC0_NEG, + extData.iFmt_VOP_SDWA.SRC0_ABS, + extData.iFmt_VOP_SDWA.SRC1_SEL, + extData.iFmt_VOP_SDWA.SRC1_SEXT, + extData.iFmt_VOP_SDWA.SRC1_NEG, + extData.iFmt_VOP_SDWA.SRC1_ABS); + + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] + src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + } + } + + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_U32 class methods --- + + Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_SUB_U32 + + Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() + { + } // ~Inst_VOP2__V_SUB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + void + Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_U32 class methods --- + + 
Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_SUBREV_U32 + + Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32() + { + } // ~Inst_VOP2__V_SUBREV_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u; + void + Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_FMAC_F32 class methods --- + + Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_fmac_f32") + { + setFlag(ALU); + } // Inst_VOP2__V_FMAC_F32 + + Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32() + { + } // ~Inst_VOP2__V_FMAC_F32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u; + void + Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + vdst.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); + } + } + + vdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc new file mode 100644 index 0000000000..8f6794c9c2 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -0,0 +1,8906 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3__V_CNDMASK_B32 class methods --- + + Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cndmask_b32", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + } // Inst_VOP3__V_CNDMASK_B32 + + Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() + { + } // ~Inst_VOP3__V_CNDMASK_B32 + + // --- description from .arch file --- + // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC + // as a scalar GPR in S2. + void + Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(vcc.rawData(), lane) + ? src1[lane] : src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_F32 class methods --- + + Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_ADD_F32 + + Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() + { + } // ~Inst_VOP3__V_ADD_F32 + + // --- description from .arch file --- + // D.f = S0.f + S1.f. 
+ void + Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_F32 class methods --- + + Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SUB_F32 + + Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() + { + } // ~Inst_VOP3__V_SUB_F32 + + // --- description from .arch file --- + // D.f = S0.f - S1.f. + // SQ translates to V_ADD_F32. 
+ void + Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_F32 class methods --- + + Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SUBREV_F32 + + Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() + { + } // ~Inst_VOP3__V_SUBREV_F32 + + // --- description from .arch file --- + // D.f = S1.f - S0.f. + // SQ translates to V_ADD_F32. 
+ void + Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods --- + + Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MUL_LEGACY_F32 + + Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32() + { + } // ~Inst_VOP3__V_MUL_LEGACY_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
+ void + Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane])) { + vdst[lane] = NAN; + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + !std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if (std::isinf(src0[lane]) && + !std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else if (std::isinf(src0[lane]) && + std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + 
} + } else { + vdst[lane] = src0[lane] * src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_F32 class methods --- + + Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MUL_F32 + + Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32() + { + } // ~Inst_VOP3__V_MUL_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f. + void + Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane])) { + vdst[lane] = NAN; + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + !std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if (std::isinf(src0[lane]) && + !std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + 
std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else if (std::isinf(src0[lane]) && + std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else { + vdst[lane] = src0[lane] * src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_I32_I24 class methods --- + + Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_i32_i24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_I32_I24 + + Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24() + { + } // ~Inst_VOP3__V_MUL_I32_I24 + + // --- description from .arch file --- + // D.i = S0.i[23:0] * S1.i[23:0]. + void + Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) + * sext<24>(bits(src1[lane], 23, 0)); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods --- + + Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_I32_I24 + + 
Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() + { + } // ~Inst_VOP3__V_MUL_HI_I32_I24 + + // --- description from .arch file --- + // D.i = (S0.i[23:0] * S1.i[23:0])>>32. + void + Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 tmp_src0 + = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); + VecElemI64 tmp_src1 + = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); + + vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_U32_U24 class methods --- + + Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_u32_u24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_U32_U24 + + Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() + { + } // ~Inst_VOP3__V_MUL_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0]. 
+ void + Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods --- + + Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_U32_U24 + + Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() + { + } // ~Inst_VOP3__V_MUL_HI_U32_U24 + + // --- description from .arch file --- + // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
+ void + Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); + VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); + vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_F32 class methods --- + + Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MIN_F32 + + Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() + { + } // ~Inst_VOP3__V_MIN_F32 + + // --- description from .arch file --- + // D.f = (S0.f < S1.f ? S0.f : S1.f). 
+ void + Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_F32 class methods --- + + Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MAX_F32 + + Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() + { + } // ~Inst_VOP3__V_MAX_F32 + + // --- description from .arch file --- + // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
+ void + Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_I32 class methods --- + + Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_I32 + + Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() + { + } // ~Inst_VOP3__V_MIN_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i). 
+ void + Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_I32 class methods --- + + Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_I32 + + Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() + { + } // ~Inst_VOP3__V_MAX_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i). 
+ void + Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_U32 class methods --- + + Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_U32 + + Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() + { + } // ~Inst_VOP3__V_MIN_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u). 
+ void + Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_U32 class methods --- + + Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_U32 + + Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() + { + } // ~Inst_VOP3__V_MAX_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u). 
+ void + Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHRREV_B32 class methods --- + + Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshrrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHRREV_B32 + + Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() + { + } // ~Inst_VOP3__V_LSHRREV_B32 + + // --- description from .arch file --- + // D.u = S1.u >> S0.u[4:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ASHRREV_I32 class methods --- + + Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ashrrev_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ASHRREV_I32 + + Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() + { + } // ~Inst_VOP3__V_ASHRREV_I32 + + // --- description from .arch file --- + // D.i = signext(S1.i) >> S0.i[4:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHLREV_B32 class methods --- + + Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshlrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHLREV_B32 + + Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() + { + } // ~Inst_VOP3__V_LSHLREV_B32 + + // --- description from .arch file --- + // D.u = S1.u << S0.u[4:0]. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_AND_B32 class methods --- + + Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_and_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_AND_B32 + + Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() + { + } // ~Inst_VOP3__V_AND_B32 + + // --- description from .arch file --- + // D.u = S0.u & S1.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] & src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_OR_B32 class methods --- + + Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_OR_B32 + + Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() + { + } // ~Inst_VOP3__V_OR_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_OR3_B32 class methods --- + + Inst_VOP3__V_OR3_B32::Inst_VOP3__V_OR3_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_or3_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_OR3_B32 + + Inst_VOP3__V_OR3_B32::~Inst_VOP3__V_OR3_B32() + { + } // ~Inst_VOP3__V_OR3_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u | S2.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_OR3_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane] | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_XOR_B32 class methods --- + + Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_xor_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_XOR_B32 + + Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() + { + } // ~Inst_VOP3__V_XOR_B32 + + // --- description from .arch file --- + // D.u = S0.u ^ S1.u. + // Input and output modifiers not supported. 
    // Per-lane 32-bit bitwise XOR: D.u = S0.u ^ S1.u.
    void
    Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] ^ src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MAC_F32 class methods ---

    Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mac_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAC);
    } // Inst_VOP3__V_MAC_F32

    Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32()
    {
    } // ~Inst_VOP3__V_MAC_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + D.f.
    // SQ translates to V_MAD_F32.
    // Multiply-accumulate: D.f = S0.f * S1.f + D.f. VDST is read first
    // because it is also the accumulator. std::fma gives the fused
    // single-rounding result. Only the src0/src1 abs/neg modifier bits
    // are honored; bit 2 would apply to the (implicit) accumulator and
    // is asserted unused.
    void
    Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        vdst.read();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_ADD_CO_U32 class methods ---

    Inst_VOP3__V_ADD_CO_U32::Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_add_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP3__V_ADD_CO_U32

    Inst_VOP3__V_ADD_CO_U32::~Inst_VOP3__V_ADD_CO_U32()
    {
    } // ~Inst_VOP3__V_ADD_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u;
    // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED
    // --- overflow or carry-out for V_ADDC_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    // D.u = S0.u + S1.u with per-lane carry-out written to the SGPR-pair
    // named by SDST (VCC in the common case). The sum is formed in 64 bits
    // so the carry test against 2^32 cannot itself wrap.
    // NOTE: the .arch description above says 0x800000000ULL, but the
    // unsigned 32-bit carry threshold is 2^32 (0x100000000ULL), as coded.
    void
    Inst_VOP3__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
                vcc.setBit(lane, ((VecElemU64)src0[lane]
                    + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP3__V_SUB_CO_U32 class methods ---

    Inst_VOP3__V_SUB_CO_U32::Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_sub_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP3__V_SUB_CO_U32

    Inst_VOP3__V_SUB_CO_U32::~Inst_VOP3__V_SUB_CO_U32()
    {
    } // ~Inst_VOP3__V_SUB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u;
    // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    // D.u = S0.u - S1.u with per-lane borrow-out (S1 > S0) written to the
    // SGPR-pair named by SDST.
    void
    Inst_VOP3__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
                vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP3__V_SUBREV_CO_U32 class methods ---

    Inst_VOP3__V_SUBREV_CO_U32::Inst_VOP3__V_SUBREV_CO_U32(
          InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_subrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP3__V_SUBREV_CO_U32

    Inst_VOP3__V_SUBREV_CO_U32::~Inst_VOP3__V_SUBREV_CO_U32()
    {
    } // ~Inst_VOP3__V_SUBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u;
    // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    // SQ translates this to V_SUB_U32 with reversed operands.
    // Reverse subtract: D.u = S1.u - S0.u; borrow-out is (S0 > S1), i.e.
    // the V_SUB_CO test with the operands swapped to match the reversal.
    void
    Inst_VOP3__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP3__V_ADDC_CO_U32 class methods ---

    Inst_VOP3__V_ADDC_CO_U32::Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_addc_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP3__V_ADDC_CO_U32

    Inst_VOP3__V_ADDC_CO_U32::~Inst_VOP3__V_ADDC_CO_U32()
    {
    } // ~Inst_VOP3__V_ADDC_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u + VCC[threadId];
    // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0)
    // is an UNSIGNED overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    // Add with carry-in: D.u = S0.u + S1.u + carry. The carry-in is the
    // per-lane bit of the SGPR-pair at SRC2; the carry-out goes to the
    // SGPR-pair at SDST. The carry-out sum is widened to 64 bits so the
    // comparison against 2^32 is exact (the .arch text's 0x800000000ULL
    // appears to be a typo for 0x100000000ULL).
    void
    Inst_VOP3__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

        src0.readSrc();
        src1.readSrc();
        vcc.read();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane]
                    + bits(vcc.rawData(), lane);
                sdst.setBit(lane, ((VecElemU64)src0[lane]
                    + (VecElemU64)src1[lane]
                        + (VecElemU64)bits(vcc.rawData(), lane))
                            >= 0x100000000 ? 1 : 0);
            }
        }

        vdst.write();
        sdst.write();
    } // execute
    // --- Inst_VOP3__V_SUBB_CO_U32 class methods ---

    Inst_VOP3__V_SUBB_CO_U32::Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_subb_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP3__V_SUBB_CO_U32

    Inst_VOP3__V_SUBB_CO_U32::~Inst_VOP3__V_SUBB_CO_U32()
    {
    } // ~Inst_VOP3__V_SUBB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // --- overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // --- source comes from the SGPR-pair at S2.u.
+ void + Inst_VOP3__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane] + - bits(vcc.rawData(), lane); + sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods --- + + Inst_VOP3__V_SUBBREV_CO_U32::Inst_VOP3__V_SUBBREV_CO_U32( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_subbrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP3__V_SUBBREV_CO_U32 + + Inst_VOP3__V_SUBBREV_CO_U32::~Inst_VOP3__V_SUBBREV_CO_U32() + { + } // ~Inst_VOP3__V_SUBBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. 
+ void + Inst_VOP3__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane] + - bits(vcc.rawData(), lane); + sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_F16 class methods --- + + Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_ADD_F16 + + Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() + { + } // ~Inst_VOP3__V_ADD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SUB_F16 class methods --- + + Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SUB_F16 + + Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() + { + } // ~Inst_VOP3__V_SUB_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 - S1.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. 
    // The VOP3 16-bit float arithmetic ops below are not implemented by
    // the model; executing any of them aborts the simulation.
    void
    Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_SUBREV_F16 class methods ---

    Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_subrev_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_SUBREV_F16

    Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16()
    {
    } // ~Inst_VOP3__V_SUBREV_F16

    // --- description from .arch file ---
    // D.f16 = S1.f16 - S0.f16.
    // Supports denormals, round mode, exception flags, saturation.
    // SQ translates to V_ADD_F16.
    void
    Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MUL_F16 class methods ---

    Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mul_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_MUL_F16

    Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16()
    {
    } // ~Inst_VOP3__V_MUL_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    void
    Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MAC_F16 class methods ---

    Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mac_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAC);
    } // Inst_VOP3__V_MAC_F16

    Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16()
    {
    } // ~Inst_VOP3__V_MAC_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16 + D.f16.
    // Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
    // Not implemented by the model; aborts the simulation if executed.
    void
    Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_ADD_U16 class methods ---

    Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_add_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ADD_U16

    Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16()
    {
    } // ~Inst_VOP3__V_ADD_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 + S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // Per-lane 16-bit unsigned add; wraps modulo 2^16 (the optional
    // saturation mode mentioned above is not modeled here).
    void
    Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SUB_U16 class methods ---

    Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_sub_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SUB_U16

    Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16()
    {
    } // ~Inst_VOP3__V_SUB_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 - S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // Per-lane 16-bit unsigned subtract: D.u16 = S0.u16 - S1.u16
    // (wraps modulo 2^16; saturation is not modeled).
    void
    Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SUBREV_U16 class methods ---

    Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_subrev_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SUBREV_U16

    Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16()
    {
    } // ~Inst_VOP3__V_SUBREV_U16

    // --- description from .arch file ---
    // D.u16 = S1.u16 - S0.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // SQ translates this to V_SUB_U16 with reversed operands.
    // Reverse 16-bit unsigned subtract: D.u16 = S1.u16 - S0.u16.
    void
    Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MUL_LO_U16 class methods ---

    Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mul_lo_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_LO_U16

    Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16()
    {
    } // ~Inst_VOP3__V_MUL_LO_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 * S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // Per-lane 16-bit multiply, low half of the product:
    // D.u16 = (S0.u16 * S1.u16) & 0xffff.
    void
    Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LSHLREV_B16 class methods ---

    Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_lshlrev_b16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHLREV_B16

    Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16()
    {
    } // ~Inst_VOP3__V_LSHLREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
    // SQ translates this to an internal SP opcode.
    // Reverse-operand 16-bit shift left: D = S1 << S0[3:0]. Only the low
    // four bits of S0 are used, so the shift amount is always 0-15.
    void
    Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LSHRREV_B16 class methods ---

    Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_lshrrev_b16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHRREV_B16

    Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16()
    {
    } // ~Inst_VOP3__V_LSHRREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
    // The vacated bits are set to zero.
    // SQ translates this to an internal SP opcode.
    // Reverse-operand 16-bit logical shift right: D = S1 >> S0[3:0];
    // vacated high bits are zero-filled.
    // NOTE(review): unlike the neighboring integer ops (e.g. V_LSHLREV_B16,
    // which asserts ABS/NEG are clear), this one applies abs/neg modifiers
    // to unsigned 16-bit operands — looks like an inconsistency inherited
    // from the original file; confirm intended behavior before changing.
    void
    Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_ASHRREV_I16 class methods ---

    Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ashrrev_i16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ASHRREV_I16

    Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16()
    {
    } // ~Inst_VOP3__V_ASHRREV_I16

    // --- description from .arch file ---
    // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
    // The vacated bits are set to the sign bit of the input value.
    // SQ translates this to an internal SP opcode.
    // Reverse-operand 16-bit arithmetic shift right: D = S1.i16 >> S0[3:0].
    // src1/vdst are signed so the >> sign-extends the vacated bits.
    void
    Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MAX_F16 class methods ---

    Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_max_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_MAX_F16

    Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16()
    {
    } // ~Inst_VOP3__V_MAX_F16

    // --- description from .arch file ---
    // D.f16 = max(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
    // Not implemented by the model; aborts the simulation if executed.
    void
    Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MIN_F16 class methods ---

    Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_min_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_MIN_F16

    Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16()
    {
    } // ~Inst_VOP3__V_MIN_F16

    // --- description from .arch file ---
    // D.f16 = min(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
    // Not implemented by the model; aborts the simulation if executed.
    void
    Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MAX_U16 class methods ---

    Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_max_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX_U16

    Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16()
    {
    } // ~Inst_VOP3__V_MAX_U16

    // --- description from .arch file ---
    // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
    // Per-lane 16-bit unsigned maximum.
    // NOTE(review): applies abs/neg modifiers to unsigned operands while
    // most sibling integer ops assert they are clear — inherited from the
    // original file; confirm before changing.
    void
    Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MAX_I16 class methods ---

    Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_max_i16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX_I16

    Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16()
    {
    } // ~Inst_VOP3__V_MAX_I16

    // --- description from .arch file ---
    // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
    // Per-lane 16-bit signed maximum: D.i16 = max(S0.i16, S1.i16).
    void
    Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MIN_U16 class methods ---

    Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_min_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN_U16

    Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16()
    {
    } // ~Inst_VOP3__V_MIN_U16

    // --- description from .arch file ---
    // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
    // Per-lane 16-bit unsigned minimum: D.u16 = min(S0.u16, S1.u16).
    void
    Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MIN_I16 class methods ---

    Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_min_i16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN_I16

    Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16()
    {
    } // ~Inst_VOP3__V_MIN_I16

    // --- description from .arch file ---
    // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
    // Per-lane 16-bit signed minimum: D.i16 = min(S0.i16, S1.i16).
    void
    Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LDEXP_F16 class methods ---

    Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ldexp_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_LDEXP_F16

    Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16()
    {
    } // ~Inst_VOP3__V_LDEXP_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * (2 ** S1.i16).
    // Not implemented by the model; aborts the simulation if executed.
    void
    Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_ADD_U32 class methods ---

    Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_add_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ADD_U32

    Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32()
    {
    } // ~Inst_VOP3__V_ADD_U32

    // --- description from .arch file ---
    // D.u32 = S0.u32 + S1.u32.
    // Per-lane 32-bit unsigned add (no carry-out variant):
    // D.u32 = S0.u32 + S1.u32, wrapping modulo 2^32.
    void
    Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SUB_U32 class methods ---

    Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_sub_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SUB_U32

    Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32()
    {
    } // ~Inst_VOP3__V_SUB_U32

    // --- description from .arch file ---
    // D.u32 = S0.u32 - S1.u32.
    // Per-lane 32-bit unsigned subtract (no borrow-out variant):
    // D.u32 = S0.u32 - S1.u32, wrapping modulo 2^32.
    void
    Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SUBREV_U32 class methods ---

    Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_subrev_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SUBREV_U32

    Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32()
    {
    } // ~Inst_VOP3__V_SUBREV_U32

    // --- description from .arch file ---
    // D.u32 = S1.u32 - S0.u32.
    // Reverse 32-bit unsigned subtract: D.u32 = S1.u32 - S0.u32.
    void
    Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_NOP class methods ---

    Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_nop", false)
    {
        setFlag(Nop);
        setFlag(ALU);
    } // Inst_VOP3__V_NOP

    Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP()
    {
    } // ~Inst_VOP3__V_NOP

    // --- description from .arch file ---
    // Do nothing.
    void
    Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
    } // execute
    // --- Inst_VOP3__V_MOV_B32 class methods ---

    Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mov_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MOV_B32

    Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32()
    {
    } // ~Inst_VOP3__V_MOV_B32

    // --- description from .arch file ---
    // D.u = S0.u.
    // Input and output modifiers not supported; this is an untyped operation.
+ void + Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_I32_F64 class methods --- + + Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_I32_F64 + + Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64() + { + } // ~Inst_VOP3__V_CVT_I32_F64 + + // --- description from .arch file --- + // D.i = (int)S0.d. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. + void + Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_I32 class methods --- + + Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_I32 + + Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32() + { + } // ~Inst_VOP3__V_CVT_F64_I32 + + // --- 
description from .arch file --- + // D.d = (double)S0.i. + void + Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_I32 class methods --- + + Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_I32 + + Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32() + { + } // ~Inst_VOP3__V_CVT_F32_I32 + + // --- description from .arch file --- + // D.f = (float)S0.i. + void + Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + VecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_U32 class methods --- + + Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_U32 + + Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() + { + } // ~Inst_VOP3__V_CVT_F32_U32 + + // --- description from 
.arch file --- + // D.f = (float)S0.u. + void + Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_U32_F32 class methods --- + + Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_U32_F32 + + Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() + { + } // ~Inst_VOP3__V_CVT_U32_F32 + + // --- description from .arch file --- + // D.u = (unsigned)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_I32_F32 class methods --- + + Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_I32_F32 + + Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32() + { + } // ~Inst_VOP3__V_CVT_I32_F32 + + // --- description from .arch file --- + // D.i = (int)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MOV_FED_B32 class methods --- + + Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mov_fed_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MOV_FED_B32 + + Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32() + { + } // ~Inst_VOP3__V_MOV_FED_B32 + + // --- description from .arch file --- + // D.u = S0.u; + // Introduce EDC double error upon write to dest vgpr without causing an + // --- exception. + // Input and output modifiers not supported; this is an untyped operation. 
+ void + Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_F32 class methods --- + + Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F16_F32 + + Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32() + { + } // ~Inst_VOP3__V_CVT_F16_F32 + + // --- description from .arch file --- + // D.f16 = flt32_to_flt16(S0.f). + // Supports input modifiers and creates FP16 denormals when appropriate. + void + Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F32_F16 class methods --- + + Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_F16 + + Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16() + { + } // ~Inst_VOP3__V_CVT_F32_F16 + + // --- description from .arch file --- + // D.f = flt16_to_flt32(S0.f16). + // FP16 denormal inputs are always accepted. + void + Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods --- + + Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_RPI_I32_F32 + + Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32() + { + } // ~Inst_VOP3__V_CVT_RPI_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f + 0.5). 
+ void + Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods --- + + Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_FLR_I32_F32 + + Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32() + { + } // ~Inst_VOP3__V_CVT_FLR_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f). + void + Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods --- + + Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_OFF_F32_I4 + + Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4() + { + } // ~Inst_VOP3__V_CVT_OFF_F32_I4 + + // --- description from .arch file --- + // 4-bit signed int to 32-bit float. Used for interpolation in shader. 
+ void + Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) + { + // Could not parse sq_uc.arch desc field + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F32_F64 class methods --- + + Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F32_F64 + + Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64() + { + } // ~Inst_VOP3__V_CVT_F32_F64 + + // --- description from .arch file --- + // D.f = (float)S0.d. + void + Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_F32 class methods --- + + Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_F32 + + Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32() + { + } // ~Inst_VOP3__V_CVT_F64_F32 + + // --- description from .arch file --- + // D.d = (double)S0.f. 
+ void + Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE0 + + Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE0 + + // --- description from .arch file --- + // D.f = (float)(S0.u[7:0]). 
+ void + Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 7, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE1 + + Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE1 + + // --- description from .arch file --- + // D.f = (float)(S0.u[15:8]). + void + Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 15, 8); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE2 + + Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE2 + + // --- description from .arch file --- + // D.f = (float)(S0.u[23:16]). 
    // Convert byte 2 (bits 23:16) of the source dword to float.
    void
    Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)bits(src[lane], 23, 16);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods ---

    Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE3

    Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE3

    // --- description from .arch file ---
    // D.f = (float)(S0.u[31:24]).
    // Convert byte 3 (bits 31:24) of the source dword to float.
    void
    Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)bits(src[lane], 31, 24);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_U32_F64 class methods ---

    Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_U32_F64

    Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64()
    {
    } // ~Inst_VOP3__V_CVT_U32_F64

    // --- description from .arch file ---
    // D.u = (unsigned)S0.d.
    // Out-of-range floating point values (including infinity) saturate. NaN is
    // --- converted to 0.
    void
    Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
                std::frexp(src[lane],&exp);
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (std::isinf(src[lane])) {
                    if (std::signbit(src[lane])) {
                        vdst[lane] = 0;
                    } else {
                        vdst[lane] = UINT_MAX;
                    }
                } else if (exp > 31) {
                    // NOTE(review): saturates |x| >= 2^31 — conservative vs
                    // UINT_MAX; kept to match the F32 variant.
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F64_U32 class methods ---

    Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_F64_U32

    Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32()
    {
    } // ~Inst_VOP3__V_CVT_F64_U32

    // --- description from .arch file ---
    // D.d = (double)S0.u.
+ void + Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_TRUNC_F64 class methods --- + + Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_TRUNC_F64 + + Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64() + { + } // ~Inst_VOP3__V_TRUNC_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d), return integer part of S0.d. + void + Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CEIL_F64 class methods --- + + Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CEIL_F64 + + Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64() + { + } // ~Inst_VOP3__V_CEIL_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. 
+ void + Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RNDNE_F64 class methods --- + + Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_RNDNE_F64 + + Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64() + { + } // ~Inst_VOP3__V_RNDNE_F64 + + // --- description from .arch file --- + // D.d = round_nearest_even(S0.d). + void + Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FLOOR_F64 class methods --- + + Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FLOOR_F64 + + Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64() + { + } // ~Inst_VOP3__V_FLOOR_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. 
+ void + Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FRACT_F32 class methods --- + + Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FRACT_F32 + + Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() + { + } // ~Inst_VOP3__V_FRACT_F32 + + // --- description from .arch file --- + // D.f = S0.f - floor(S0.f). + void + Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_TRUNC_F32 class methods --- + + Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_TRUNC_F32 + + Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() + { + } // ~Inst_VOP3__V_TRUNC_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f), return integer part of S0.f. 
+ void + Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CEIL_F32 class methods --- + + Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CEIL_F32 + + Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32() + { + } // ~Inst_VOP3__V_CEIL_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. + void + Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RNDNE_F32 class methods --- + + Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RNDNE_F32 + + Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32() + { + } // ~Inst_VOP3__V_RNDNE_F32 + + // --- description from .arch file --- + // D.f = round_nearest_even(S0.f). 
+ void + Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FLOOR_F32 class methods --- + + Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FLOOR_F32 + + Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32() + { + } // ~Inst_VOP3__V_FLOOR_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. + void + Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_EXP_F32 class methods --- + + Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_exp_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_EXP_F32 + + Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32() + { + } // ~Inst_VOP3__V_EXP_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f). 
    // Base-2 exponential per lane.
    void
    Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::pow(2.0, src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LOG_F32 class methods ---

    Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_log_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_LOG_F32

    Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32()
    {
    } // ~Inst_VOP3__V_LOG_F32

    // --- description from .arch file ---
    // D.f = log2(S0.f). Base 2 logarithm.
    void
    Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::log2(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RCP_F32 class methods ---

    Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rcp_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RCP_F32

    Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32()
    {
    } // ~Inst_VOP3__V_RCP_F32

    // --- description from .arch file ---
    // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error.
    void
    Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        // IEEE division handles zero/Inf/NaN inputs directly (contrast with
        // the explicitly special-cased v_rcp_f64 below).
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods ---

    Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RCP_IFLAG_F32

    Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32()
    {
    } // ~Inst_VOP3__V_RCP_IFLAG_F32

    // --- description from .arch file ---
    // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
    // --- integer DIV_BY_ZERO exception but cannot raise floating-point
    // --- exceptions.
    // The exception side effect is not modeled; computed like v_rcp_f32.
    void
    Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RSQ_F32 class methods ---

    Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rsq_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RSQ_F32

    Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32()
    {
    } // ~Inst_VOP3__V_RSQ_F32

    // --- description from .arch file ---
    // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules.
    void
    Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / std::sqrt(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RCP_F64 class methods ---

    Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rcp_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RCP_F64

    Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64()
    {
    } // ~Inst_VOP3__V_RCP_F64

    // --- description from .arch file ---
    // D.d = 1.0 / S0.d.
    void
    Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Special cases spelled out explicitly: 1/0 -> +Inf,
                // 1/NaN -> NaN, 1/(+/-Inf) -> +/-0; otherwise IEEE divide.
                if (std::fpclassify(src[lane]) == FP_ZERO) {
                    vdst[lane] = +INFINITY;
                } else if (std::isnan(src[lane])) {
                    vdst[lane] = NAN;
                } else if (std::isinf(src[lane])) {
                    if (std::signbit(src[lane])) {
                        vdst[lane] = -0.0;
                    } else {
                        vdst[lane] = 0.0;
                    }
                } else {
                    vdst[lane] = 1.0 / src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RSQ_F64 class methods ---

    Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rsq_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RSQ_F64

    Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64()
    {
    } // ~Inst_VOP3__V_RSQ_F64
+ + // --- description from .arch file --- + // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32. + void + Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) { + vdst[lane] = 0.0; + } else if (std::signbit(src[lane])) { + vdst[lane] = NAN; + } else { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SQRT_F32 class methods --- + + Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SQRT_F32 + + Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32() + { + } // ~Inst_VOP3__V_SQRT_F32 + + // --- description from .arch file --- + // D.f = sqrt(S0.f). 
+ void + Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SQRT_F64 class methods --- + + Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_SQRT_F64 + + Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64() + { + } // ~Inst_VOP3__V_SQRT_F64 + + // --- description from .arch file --- + // D.d = sqrt(S0.d). + void + Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SIN_F32 class methods --- + + Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sin_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SIN_F32 + + Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32() + { + } // ~Inst_VOP3__V_SIN_F32 + + // --- description from .arch file --- + // D.f = sin(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 0.0. 
+    void
+    Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
+        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+        pi.read();
+
+        if (instData.ABS & 0x1) {
+            src.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Valid input range is [-256.0, +256.0] revolutions; the
+                // ISA defines out-of-range input to produce 0.0 rather
+                // than a wrapped sine value.
+                if (src[lane] < -256.0 || src[lane] > 256.0) {
+                    vdst[lane] = 0.0;
+                } else {
+                    vdst[lane] = std::sin(src[lane] * 2 * pi.rawData());
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_COS_F32 class methods ---
+
+    Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cos_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_COS_F32
+
+    Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32()
+    {
+    } // ~Inst_VOP3__V_COS_F32
+
+    // --- description from .arch file ---
+    // D.f = cos(S0.f * 2 * PI).
+    // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
+    // float 1.0.
+    void
+    Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
+        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+        pi.read();
+
+        if (instData.ABS & 0x1) {
+            src.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Same range rule as V_SIN_F32, but out-of-range input
+                // yields 1.0 for cosine per the ISA description.
+                if (src[lane] < -256.0 || src[lane] > 256.0) {
+                    vdst[lane] = 1.0;
+                } else {
+                    vdst[lane] = std::cos(src[lane] * 2 * pi.rawData());
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_NOT_B32 class methods ---
+
+    Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_not_b32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_NOT_B32
+
+    Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32()
+    {
+    } // ~Inst_VOP3__V_NOT_B32
+
+    // --- description from .arch file ---
+    // D.u = ~S0.u.
+ // Input and output modifiers not supported. + void + Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ~src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_BFREV_B32 class methods --- + + Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFREV_B32 + + Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32() + { + } // ~Inst_VOP3__V_BFREV_B32 + + // --- description from .arch file --- + // D.u[31:0] = S0.u[0:31], bitfield reverse. + // Input and output modifiers not supported. + void + Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = reverseBits(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBH_U32 class methods --- + + Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbh_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBH_U32 + + Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32() + { + } // ~Inst_VOP3__V_FFBH_U32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from MSB; + // D.u = 0xffffffff if S0.u == 0. 
+ void + Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOneMsb(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBL_B32 class methods --- + + Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbl_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBL_B32 + + Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32() + { + } // ~Inst_VOP3__V_FFBL_B32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from LSB; + // D.u = 0xffffffff if S0.u == 0. + void + Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOne(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBH_I32 class methods --- + + Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbh_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBH_I32 + + Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32() + { + } // ~Inst_VOP3__V_FFBH_I32 + + // --- description from .arch file --- + // D.u = position of first bit different from sign bit in S0.i from MSB; + // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. 
+ void + Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = firstOppositeSignBit(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods --- + + Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FREXP_EXP_I32_F64 + + Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64() + { + } // ~Inst_VOP3__V_FREXP_EXP_I32_F64 + + // --- description from .arch file --- + // See V_FREXP_EXP_I32_F32. + void + Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F64 class methods --- + + Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FREXP_MANT_F64 + + Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64() + { + } // ~Inst_VOP3__V_FREXP_MANT_F64 + + // --- 
description from .arch file --- + // See V_FREXP_MANT_F32. + void + Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FRACT_F64 class methods --- + + Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FRACT_F64 + + Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() + { + } // ~Inst_VOP3__V_FRACT_F64 + + // --- description from .arch file --- + // See V_FRACT_F32. + void + Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods --- + + Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FREXP_EXP_I32_F32 + + Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() + { + } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) 
then D.i = 0; + // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). + // Returns exponent of single precision float input, such that S0.f = + // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns + // the significand. + void + Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane])|| std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F32 class methods --- + + Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FREXP_MANT_F32 + + Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32() + { + } // ~Inst_VOP3__V_FREXP_MANT_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; + // else D.f = Mantissa(S0.f). + // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary + // --- significand of single precision float input, such that S0.f = + // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which + // --- returns integer exponent. 
+ void + Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = src[lane]; + } else { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CLREXCP class methods --- + + Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_clrexcp", false) + { + } // Inst_VOP3__V_CLREXCP + + Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP() + { + } // ~Inst_VOP3__V_CLREXCP + + // --- description from .arch file --- + // Clear wave's exception state in SIMD (SP). + void + Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_U16 class methods --- + + Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_F16_U16 + + Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16() + { + } // ~Inst_VOP3__V_CVT_F16_U16 + + // --- description from .arch file --- + // D.f16 = uint16_to_flt16(S.u16). + // Supports denormals, rounding, exception flags and saturation. 
+ void + Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_I16 class methods --- + + Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_F16_I16 + + Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16() + { + } // ~Inst_VOP3__V_CVT_F16_I16 + + // --- description from .arch file --- + // D.f16 = int16_to_flt16(S.i16). + // Supports denormals, rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_U16_F16 class methods --- + + Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_U16_F16 + + Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16() + { + } // ~Inst_VOP3__V_CVT_U16_F16 + + // --- description from .arch file --- + // D.u16 = flt16_to_uint16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_I16_F16 class methods --- + + Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_I16_F16 + + Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16() + { + } // ~Inst_VOP3__V_CVT_I16_F16 + + // --- description from .arch file --- + // D.i16 = flt16_to_int16(S.f16). + // Supports rounding, exception flags and saturation. 
+ void + Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RCP_F16 class methods --- + + Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RCP_F16 + + Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16() + { + } // ~Inst_VOP3__V_RCP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecip(S0.f16). + void + Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SQRT_F16 class methods --- + + Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SQRT_F16 + + Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16() + { + } // ~Inst_VOP3__V_SQRT_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateSqrt(S0.f16). + void + Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RSQ_F16 class methods --- + + Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rsq_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RSQ_F16 + + Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16() + { + } // ~Inst_VOP3__V_RSQ_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecipSqrt(S0.f16). 
+ void + Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_LOG_F16 class methods --- + + Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_log_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_LOG_F16 + + Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16() + { + } // ~Inst_VOP3__V_LOG_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 0.0f; + // else + // D.f16 = ApproximateLog2(S0.f16). + void + Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_EXP_F16 class methods --- + + Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_exp_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_EXP_F16 + + Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16() + { + } // ~Inst_VOP3__V_EXP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 0.0f) + // D.f16 = 1.0f; + // else + // D.f16 = Approximate2ToX(S0.f16). + void + Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F16 class methods --- + + Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FREXP_MANT_F16 + + Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16() + { + } // ~Inst_VOP3__V_FREXP_MANT_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.f16 = S0.f16; + // else + // D.f16 = mantissa(S0.f16). + // Result range is (-1.0,-0.5][0.5,1.0). + // C math library frexp function. + // Returns binary significand of half precision float input, such that the + // original single float = significand * (2 ** exponent). 
+ void + Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods --- + + Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FREXP_EXP_I16_F16 + + Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16() + { + } // ~Inst_VOP3__V_FREXP_EXP_I16_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.i16 = 0; + // else + // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). + // C math library frexp function. + // Returns exponent of half precision float input, such that the + // original single float = significand * (2 ** exponent). + void + Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FLOOR_F16 class methods --- + + Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FLOOR_F16 + + Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16() + { + } // ~Inst_VOP3__V_FLOOR_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. + void + Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CEIL_F16 class methods --- + + Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CEIL_F16 + + Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16() + { + } // ~Inst_VOP3__V_CEIL_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
+ void + Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_TRUNC_F16 class methods --- + + Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_TRUNC_F16 + + Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16() + { + } // ~Inst_VOP3__V_TRUNC_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16). + // Round-to-zero semantics. + void + Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RNDNE_F16 class methods --- + + Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RNDNE_F16 + + Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16() + { + } // ~Inst_VOP3__V_RNDNE_F16 + + // --- description from .arch file --- + // D.f16 = FLOOR(S0.f16 + 0.5f); + // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. + // Round-to-nearest-even semantics. + void + Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_FRACT_F16 class methods --- + + Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_FRACT_F16 + + Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16() + { + } // ~Inst_VOP3__V_FRACT_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + -floor(S0.f16). 
+ void + Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SIN_F16 class methods --- + + Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sin_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SIN_F16 + + Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16() + { + } // ~Inst_VOP3__V_SIN_F16 + + // --- description from .arch file --- + // D.f16 = sin(S0.f16 * 2 * PI). + void + Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_COS_F16 class methods --- + + Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cos_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_COS_F16 + + Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16() + { + } // ~Inst_VOP3__V_COS_F16 + + // --- description from .arch file --- + // D.f16 = cos(S0.f16 * 2 * PI). + void + Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods --- + + Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_EXP_LEGACY_F32 + + Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32() + { + } // ~Inst_VOP3__V_EXP_LEGACY_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f) with legacy semantics. 
+ void + Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::pow(2.0, src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods --- + + Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_log_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_LOG_LEGACY_F32 + + Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32() + { + } // ~Inst_VOP3__V_LOG_LEGACY_F32 + + // --- description from .arch file --- + // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. 
+    void
+    Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Apply the VOP3 input modifiers to SRC0, matching every other
+        // single-source F32 transcendental (e.g. V_EXP_LEGACY_F32);
+        // previously ABS/NEG were silently ignored for this opcode.
+        if (instData.ABS & 0x1) {
+            src.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::log2(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods ---
+
+    Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+        setFlag(MAD);
+    } // Inst_VOP3__V_MAD_LEGACY_F32
+
+    Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32()
+    {
+    } // ~Inst_VOP3__V_MAD_LEGACY_F32
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0).
+    void
+    Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (instData.ABS & 0x4) {
+            src2.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        if (extData.NEG & 0x4) {
+            src2.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MAD_F32 class methods ---
+
+    Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mad_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+        setFlag(MAD);
+    }
// Inst_VOP3__V_MAD_F32 + + Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() + { + } // ~Inst_VOP3__V_MAD_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + S2.f. + void + Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_I32_I24 class methods --- + + Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_i32_i24", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_I32_I24 + + Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24() + { + } // ~Inst_VOP3__V_MAD_I32_I24 + + // --- description from .arch file --- + // D.i = S0.i[23:0] * S1.i[23:0] + S2.i. 
+ void + Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) + * sext<24>(bits(src1[lane], 23, 0)) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_U32_U24 class methods --- + + Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_u32_u24", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_U32_U24 + + Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24() + { + } // ~Inst_VOP3__V_MAD_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0] + S2.u. 
+ void + Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0) + + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CUBEID_F32 class methods --- + + Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubeid_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBEID_F32 + + Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32() + { + } // ~Inst_VOP3__V_CUBEID_F32 + + // --- description from .arch file --- + // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in + // --- (S0.f, S1.f, S2.f). + void + Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CUBESC_F32 class methods --- + + Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubesc_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBESC_F32 + + Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32() + { + } // ~Inst_VOP3__V_CUBESC_F32 + + // --- description from .arch file --- + // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f, + // S2.f). 
+ void + Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CUBETC_F32 class methods --- + + Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubetc_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBETC_F32 + + Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32() + { + } // ~Inst_VOP3__V_CUBETC_F32 + + // --- description from .arch file --- + // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f, + // S2.f). + void + Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CUBEMA_F32 class methods --- + + Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cubema_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CUBEMA_F32 + + Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32() + { + } // ~Inst_VOP3__V_CUBEMA_F32 + + // --- description from .arch file --- + // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f, + // --- S2.f). 
+ void + Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_BFE_U32 class methods --- + + Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfe_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFE_U32 + + Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32() + { + } // ~Inst_VOP3__V_BFE_U32 + + // --- description from .arch file --- + // D.u = (S0.u>>S1.u[4:0]) & ((1<wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) + & ((1 << bits(src2[lane], 4, 0)) - 1); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_BFE_I32 class methods --- + + Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfe_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFE_I32 + + Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32() + { + } // ~Inst_VOP3__V_BFE_I32 + + // --- description from .arch file --- + // D.i = (S0.i>>S1.u[4:0]) & ((1<wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + 
assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) + & ((1 << bits(src2[lane], 4, 0)) - 1); + + // Above extracted a signed int of size src2 bits which needs + // to be signed-extended. Check if the MSB of our src2-bit + // integer is 1, and sign extend it is. + if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) { + vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_BFI_B32 class methods --- + + Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfi_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFI_B32 + + Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32() + { + } // ~Inst_VOP3__V_BFI_B32 + + // --- description from .arch file --- + // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert. 
+ void + Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane] + & src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FMA_F32 class methods --- + + Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fma_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(FMA); + } // Inst_VOP3__V_FMA_F32 + + Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32() + { + } // ~Inst_VOP3__V_FMA_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + S2.f. 
+ void + Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FMA_F64 class methods --- + + Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fma_f64", false) + { + setFlag(ALU); + setFlag(F64); + setFlag(FMA); + } // Inst_VOP3__V_FMA_F64 + + Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64() + { + } // ~Inst_VOP3__V_FMA_F64 + + // --- description from .arch file --- + // D.d = S0.d * S1.d + S2.d. 
+ void + Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LERP_U8 class methods --- + + Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lerp_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LERP_U8 + + Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8() + { + } // ~Inst_VOP3__V_LERP_U8 + + // --- description from .arch file --- + // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24 + // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16; + // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8; + // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1). + // Unsigned 8-bit pixel average on packed unsigned bytes (linear + // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up, + // --- otherwise 0.5 truncates. 
+ void + Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ((bits(src0[lane], 31, 24) + + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1) + << 24; + vdst[lane] += ((bits(src0[lane], 23, 16) + + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1) + << 16; + vdst[lane] += ((bits(src0[lane], 15, 8) + + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1) + << 8; + vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0) + + bits(src2[lane], 0)) >> 1); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ALIGNBIT_B32 class methods --- + + Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_alignbit_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ALIGNBIT_B32 + + Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32() + { + } // ~Inst_VOP3__V_ALIGNBIT_B32 + + // --- description from .arch file --- + // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff. 
+ void + Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) + | (VecElemU64)src1[lane]); + vdst[lane] = (VecElemU32)((src_0_1 + >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods --- + + Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_alignbyte_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ALIGNBYTE_B32 + + Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32() + { + } // ~Inst_VOP3__V_ALIGNBYTE_B32 + + // --- description from .arch file --- + // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff. 
+ void + Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) + | (VecElemU64)src1[lane]); + vdst[lane] = (VecElemU32)((src_0_1 + >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0))) + & 0xffffffff); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN3_F32 class methods --- + + Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min3_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MIN3_F32 + + Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32() + { + } // ~Inst_VOP3__V_MIN3_F32 + + // --- description from .arch file --- + // D.f = min(S0.f, S1.f, S2.f). 
+ void + Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]); + vdst[lane] = std::fmin(min_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN3_I32 class methods --- + + Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min3_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN3_I32 + + Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32() + { + } // ~Inst_VOP3__V_MIN3_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i, S2.i). 
+ void + Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]); + vdst[lane] = std::min(min_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN3_U32 class methods --- + + Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN3_U32 + + Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32() + { + } // ~Inst_VOP3__V_MIN3_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u, S2.u). 
+ void + Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]); + vdst[lane] = std::min(min_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX3_F32 class methods --- + + Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max3_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MAX3_F32 + + Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32() + { + } // ~Inst_VOP3__V_MAX3_F32 + + // --- description from .arch file --- + // D.f = max(S0.f, S1.f, S2.f). 
+ void + Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]); + vdst[lane] = std::fmax(max_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX3_I32 class methods --- + + Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max3_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX3_I32 + + Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() + { + } // ~Inst_VOP3__V_MAX3_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i, S2.i). 
+ void + Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]); + vdst[lane] = std::max(max_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX3_U32 class methods --- + + Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX3_U32 + + Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() + { + } // ~Inst_VOP3__V_MAX3_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u, S2.u). 
+ void + Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]); + vdst[lane] = std::max(max_0_1, src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MED3_F32 class methods --- + + Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_med3_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MED3_F32 + + Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() + { + } // ~Inst_VOP3__V_MED3_F32 + + // --- description from .arch file --- + // D.f = median(S0.f, S1.f, S2.f). 
+ void + Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = median(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MED3_I32 class methods --- + + Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_med3_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MED3_I32 + + Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32() + { + } // ~Inst_VOP3__V_MED3_I32 + + // --- description from .arch file --- + // D.i = median(S0.i, S1.i, S2.i). 
+ void + Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = median(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MED3_U32 class methods --- + + Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_med3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MED3_U32 + + Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32() + { + } // ~Inst_VOP3__V_MED3_U32 + + // --- description from .arch file --- + // D.u = median(S0.u, S1.u, S2.u). 
+ void + Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = median(src0[lane], src1[lane], src2[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_U8 class methods --- + + Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_U8 + + Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() + { + } // ~Inst_VOP3__V_SAD_U8 + + // --- description from .arch file --- + // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) + + // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u. + // Sum of absolute differences with accumulation, overflow into upper bits + // is allowed. 
+ void + Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::abs(bits(src0[lane], 31, 24) + - bits(src1[lane], 31, 24)) + + std::abs(bits(src0[lane], 23, 16) + - bits(src1[lane], 23, 16)) + + std::abs(bits(src0[lane], 15, 8) + - bits(src1[lane], 15, 8)) + + std::abs(bits(src0[lane], 7, 0) + - bits(src1[lane], 7, 0)) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_HI_U8 class methods --- + + Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_hi_u8", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_HI_U8 + + Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8() + { + } // ~Inst_VOP3__V_SAD_HI_U8 + + // --- description from .arch file --- + // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u. + // Sum of absolute differences with accumulation, overflow is lost. 
+ void + Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (((bits(src0[lane], 31, 24) + - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16) + - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8) + - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0) + - bits(src1[lane], 7, 0))) << 16) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_U16 class methods --- + + Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_U16 + + Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() + { + } // ~Inst_VOP3__V_SAD_U16 + + // --- description from .arch file --- + // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) + // + S2.u. + // Word SAD with accumulation. 
+ void + Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::abs(bits(src0[lane], 31, 16) + - bits(src1[lane], 31, 16)) + + std::abs(bits(src0[lane], 15, 0) + - bits(src1[lane], 15, 0)) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_U32 class methods --- + + Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_U32 + + Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32() + { + } // ~Inst_VOP3__V_SAD_U32 + + // --- description from .arch file --- + // D.u = abs(S0.i - S1.i) + S2.u. + // Dword SAD with accumulation. 
// Dword sum-of-absolute-differences with accumulation:
// D = |S0 - S1| + S2.
void
Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane];
        } // if
    } // for

    vdst.write();
} // execute
// --- Inst_VOP3__V_CVT_PK_U8_F32 class methods ---

Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_PK_U8_F32

Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32()
{
} // ~Inst_VOP3__V_CVT_PK_U8_F32

// --- description from .arch file ---
// D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0]))
// | (S2.u & ~(0xff << (8 * S1.u[1:0]))).
// Convert floating point value S0 to 8-bit unsigned integer and pack the
// result into byte S1 of dword S2.
// Convert S0.f to an unsigned byte and insert it into byte lane
// S1.u[1:0] of S2, leaving the other three bytes of S2 intact.
void
Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    // Only the FP source (S0) may carry input modifiers.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }


    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // NOTE(review): the float->VecElemU8 cast truncates toward
            // zero; behavior for values outside [0, 255] is not clamped
            // here -- confirm against flt32_to_uint8 semantics.
            vdst[lane] = (((VecElemU8)src0[lane] & 0xff)
                << (8 * bits(src1[lane], 1, 0)))
                | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0))));
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_DIV_FIXUP_F32 class methods ---

Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_div_fixup_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_DIV_FIXUP_F32

Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32()
{
} // ~Inst_VOP3__V_DIV_FIXUP_F32

// --- description from .arch file ---
// D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
// s2.f = Numerator. This opcode generates exceptions resulting from the
// division operation.
// Single-precision divide fixup: patch up special-case results of a
// division (S0 = quotient, S1 = denominator, S2 = numerator).
void
Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (instData.ABS & 0x4) {
        src2.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    if (extData.NEG & 0x4) {
        src2.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Special cases are checked in order: zero denominator,
            // NaN operands, infinite denominator; otherwise redo the
            // division directly.
            // NOTE(review): the zero/inf-denominator branches pick the
            // result sign from S1 alone, not sign(S1)^sign(S2) --
            // confirm against the ISA's div_fixup definition.
            if (std::fpclassify(src1[lane]) == FP_ZERO) {
                if (std::signbit(src1[lane])) {
                    vdst[lane] = -INFINITY;
                } else {
                    vdst[lane] = +INFINITY;
                }
            } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
                vdst[lane] = NAN;
            } else if (std::isinf(src1[lane])) {
                if (std::signbit(src1[lane])) {
                    vdst[lane] = -INFINITY;
                } else {
                    vdst[lane] = +INFINITY;
                }
            } else {
                vdst[lane] = src2[lane] / src1[lane];
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---

Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_DIV_FIXUP_F64

Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64()
{
} // ~Inst_VOP3__V_DIV_FIXUP_F64

// --- description from .arch file ---
// D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
// s2.d = Numerator. This opcode generates exceptions resulting from the
// division operation.
+ void + Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int sign_out = std::signbit(src1[lane]) + ^ std::signbit(src2[lane]); + int exp1(0); + int exp2(0); + std::frexp(src1[lane], &exp1); + std::frexp(src2[lane], &exp2); + + if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { + vdst[lane] = std::numeric_limits::quiet_NaN(); + } else if (std::fpclassify(src1[lane]) == FP_ZERO + && std::fpclassify(src2[lane]) == FP_ZERO) { + vdst[lane] + = std::numeric_limits::signaling_NaN(); + } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { + vdst[lane] + = std::numeric_limits::signaling_NaN(); + } else if (std::fpclassify(src1[lane]) == FP_ZERO + || std::isinf(src2[lane])) { + vdst[lane] = sign_out ? -INFINITY : +INFINITY; + } else if (std::isinf(src1[lane]) + || std::fpclassify(src2[lane]) == FP_ZERO) { + vdst[lane] = sign_out ? -0.0 : +0.0; + } else if (exp2 - exp1 < -1075) { + vdst[lane] = src0[lane]; + } else if (exp1 == 2047) { + vdst[lane] = src0[lane]; + } else { + vdst[lane] = sign_out ? 
-std::fabs(src0[lane]) + : std::fabs(src0[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_SCALE_F32 class methods --- + + Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_div_scale_f32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(F32); + } // Inst_VOP3__V_DIV_SCALE_F32 + + Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32() + { + } // ~Inst_VOP3__V_DIV_SCALE_F32 + + // --- description from .arch file --- + // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f = + // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a + // numerator and denominator, this opcode will appropriately scale inputs + // for division to avoid subnormal terms during Newton-Raphson correction + // algorithm. This opcode producses a VCC flag for post-scale of quotient. + void + Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane]; + vcc.setBit(lane, 0); + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- + + Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_div_scale_f64") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(F64); + } // Inst_VOP3__V_DIV_SCALE_F64 + + Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64() + { + 
} // ~Inst_VOP3__V_DIV_SCALE_F64 + + // --- description from .arch file --- + // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d = + // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a + // numerator and denominator, this opcode will appropriately scale inputs + // for division to avoid subnormal terms during Newton-Raphson correction + // algorithm. This opcode producses a VCC flag for post-scale of quotient. + void + Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp1(0); + int exp2(0); + std::frexp(src1[lane], &exp1); + std::frexp(src2[lane], &exp2); + vcc.setBit(lane, 0); + + if (std::fpclassify(src1[lane]) == FP_ZERO + || std::fpclassify(src2[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (exp2 - exp1 >= 768) { + vcc.setBit(lane, 1); + if (src0[lane] == src1[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { + vdst[lane] = std::ldexp(src0[lane], 128); + } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL + && std::fpclassify(src2[lane] / src1[lane]) + == FP_SUBNORMAL) { + vcc.setBit(lane, 1); + if (src0[lane] == src1[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { + vdst[lane] = std::ldexp(src0[lane], -128); + } else if (std::fpclassify(src2[lane] / 
src1[lane]) + == FP_SUBNORMAL) { + vcc.setBit(lane, 1); + if (src0[lane] == src2[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (exp2 <= 53) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_FMAS_F32 class methods --- + + Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fmas_f32", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + setFlag(F32); + setFlag(FMA); + } // Inst_VOP3__V_DIV_FMAS_F32 + + Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() + { + } // ~Inst_VOP3__V_DIV_FMAS_F32 + + // --- description from .arch file --- + // D.f = Special case divide FMA with scale and flags(s0.f = Quotient, + // s1.f = Denominator, s2.f = Numerator) + void + Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + //vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- + + Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fmas_f64", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + setFlag(F64); + setFlag(FMA); + } // Inst_VOP3__V_DIV_FMAS_F64 + + 
Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64()
{
} // ~Inst_VOP3__V_DIV_FMAS_F64

// --- description from .arch file ---
// D.d = Special case divide FMA with scale and flags(s0.d = Quotient,
// s1.d = Denominator, s2.d = Numerator)
void
Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);
    // VCC selects, per lane, whether the FMA result is post-scaled.
    ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();
    vcc.read();

    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (instData.ABS & 0x4) {
        src2.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    if (extData.NEG & 0x4) {
        src2.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (bits(vcc.rawData(), lane)) {
                // VCC set: scale the FMA result by 2^64.
                // NOTE(review): pow(2, 64) yields an exactly
                // representable double, so this matches ldexp(x, 64) --
                // but confirm intent.
                vdst[lane] = std::pow(2, 64)
                    * std::fma(src0[lane], src1[lane], src2[lane]);
            } else {
                vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_MSAD_U8 class methods ---

Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_msad_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MSAD_U8

Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8()
{
} // ~Inst_VOP3__V_MSAD_U8

// --- description from .arch file ---
// D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u).
// Masked byte SAD: not implemented in this model.
void
Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods ---

Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_QSAD_PK_U16_U8

Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8()
{
} // ~Inst_VOP3__V_QSAD_PK_U16_U8

// --- description from .arch file ---
// D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
// S1.u[31:0], S2.u[63:0])
// Quad-byte packed SAD: not implemented in this model.
void
Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods ---

Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8(
      InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MQSAD_PK_U16_U8

Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8()
{
} // ~Inst_VOP3__V_MQSAD_PK_U16_U8

// --- description from .arch file ---
// D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
// --- S1.u[31:0], S2.u[63:0])
// Masked quad-byte packed SAD: not implemented in this model.
void
Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_MQSAD_U32_U8 class methods ---

Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MQSAD_U32_U8

Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8()
{
} // ~Inst_VOP3__V_MQSAD_U32_U8

// --- description from .arch file ---
// D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
// --- S1.u[31:0], S2.u[127:0])
// Masked quad-byte SAD with 32-bit accumulators: not implemented.
void
Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_MAD_U64_U32 class methods ---

Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32(
      InFmt_VOP3B *iFmt)
    : Inst_VOP3B(iFmt, "v_mad_u64_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
    setFlag(MAD);
} // Inst_VOP3__V_MAD_U64_U32

Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32()
{
} // ~Inst_VOP3__V_MAD_U64_U32

// --- description from .arch file ---
// {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64.
// 32x32 -> 64-bit unsigned multiply-add; the per-lane carry-out of the
// 64-bit add is recorded in VCC via muladd()'s return value.
void
Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
    ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();
    vdst.read();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // muladd() writes the 64-bit result into vdst[lane] and
            // returns the carry bit for VCC.
            vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
                src2[lane]));
        }
    }

    vcc.write();
    vdst.write();
} // execute
// --- Inst_VOP3__V_MAD_I64_I32 class methods ---

Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32(
      InFmt_VOP3B *iFmt)
    : Inst_VOP3B(iFmt, "v_mad_i64_i32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
    setFlag(MAD);
} // Inst_VOP3__V_MAD_I64_I32

Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32()
{
} // ~Inst_VOP3__V_MAD_I64_I32

// --- description from .arch file ---
// {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
// 32x32 -> 64-bit signed multiply-add; the per-lane carry-out is
// recorded in VCC via muladd()'s return value.
// NOTE(review): unlike V_MAD_U64_U32 there is no vdst.read() here even
// though vdst[lane] is passed to muladd() -- consistent only if that
// argument is write-only; confirm against muladd()'s definition.
void
Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandI64 src2(gpuDynInst, extData.SRC2);
    ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
    VecOperandI64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
                src2[lane]));
        }
    }

    vcc.write();
    vdst.write();
} // execute
// --- Inst_VOP3__V_XAD_U32 class methods ---

Inst_VOP3__V_XAD_U32::Inst_VOP3__V_XAD_U32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_xad_u32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_XAD_U32

Inst_VOP3__V_XAD_U32::~Inst_VOP3__V_XAD_U32()
{
} // ~Inst_VOP3__V_XAD_U32

// --- description from .arch file ---
// D.u32 = (S0.u32 ^ S1.u32) + S2.u32.
+ void + Inst_VOP3__V_XAD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHL_ADD_U32 class methods --- + + Inst_VOP3__V_LSHL_ADD_U32::Inst_VOP3__V_LSHL_ADD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_add_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_ADD_U32 + + Inst_VOP3__V_LSHL_ADD_U32::~Inst_VOP3__V_LSHL_ADD_U32() + { + } // ~Inst_VOP3__V_LSHL_ADD_U32 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) + S2.u. 
+ void + Inst_VOP3__V_LSHL_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) + + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_LSHL_U32 class methods --- + + Inst_VOP3__V_ADD_LSHL_U32::Inst_VOP3__V_ADD_LSHL_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_lshl_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD_LSHL_U32 + + Inst_VOP3__V_ADD_LSHL_U32::~Inst_VOP3__V_ADD_LSHL_U32() + { + } // ~Inst_VOP3__V_ADD_LSHL_U32 + + // --- description from .arch file --- + // D.u = (S0.u + S1.u) << S2.u[4:0]. 
+ void + Inst_VOP3__V_ADD_LSHL_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = + (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD3_U32 class methods --- + + Inst_VOP3__V_ADD3_U32::Inst_VOP3__V_ADD3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD3_U32 + + Inst_VOP3__V_ADD3_U32::~Inst_VOP3__V_ADD3_U32() + { + } // ~Inst_VOP3__V_ADD3_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + S2.u. 
+ void + Inst_VOP3__V_ADD3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHL_OR_B32 class methods --- + + Inst_VOP3__V_LSHL_OR_B32::Inst_VOP3__V_LSHL_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_OR_B32 + + Inst_VOP3__V_LSHL_OR_B32::~Inst_VOP3__V_LSHL_OR_B32() + { + } // ~Inst_VOP3__V_LSHL_OR_B32 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) | S2.u. 
+ void + Inst_VOP3__V_LSHL_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) + | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_AND_OR_B32 class methods --- + + Inst_VOP3__V_AND_OR_B32::Inst_VOP3__V_AND_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_and_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_AND_OR_B32 + + Inst_VOP3__V_AND_OR_B32::~Inst_VOP3__V_AND_OR_B32() + { + } // ~Inst_VOP3__V_AND_OR_B32 + + // --- description from .arch file --- + // D.u = (S0.u & S1.u) | S2.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_AND_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] & src1[lane]) | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_F16 class methods --- + + Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_f16", false) + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP3__V_MAD_F16 + + Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16() + { + } // ~Inst_VOP3__V_MAD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + S2.f16. + // Supports round mode, exception flags, saturation. + void + Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MAD_U16 class methods --- + + Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_u16", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_U16 + + Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16() + { + } // ~Inst_VOP3__V_MAD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16 + S2.u16. + // Supports saturation (unsigned 16-bit integer domain). 
+ void + Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_I16 class methods --- + + Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_i16", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_I16 + + Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() + { + } // ~Inst_VOP3__V_MAD_I16 + + // --- description from .arch file --- + // D.i16 = S0.i16 * S1.i16 + S2.i16. + // Supports saturation (signed 16-bit integer domain). 
+ void + Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI16 src2(gpuDynInst, extData.SRC2); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_PERM_B32 class methods --- + + Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_perm_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_PERM_B32 + + Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32() + { + } // ~Inst_VOP3__V_PERM_B32 + + // --- description from .arch file --- + // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]); + // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]); + // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]); + // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]); + // byte permute(byte in[8], byte sel) { + // if (sel>=13) then return 0xff; + // elsif(sel==12) then return 0x00; + // elsif(sel==11) then return in[7][7] * 0xff; + // elsif(sel==10) then return in[5][7] * 0xff; + // elsif(sel==9) then return in[3][7] * 0xff; + // elsif(sel==8) then return in[1][7] * 0xff; + // else return in[sel]; + // } + // Byte permute. 
// Byte permute: each byte of S2 selects one byte (or a fill pattern)
// from the 64-bit value {S0, S1} via the permute() helper.
void
Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Concatenate {S0, S1} into the 64-bit source the selector
            // bytes index into.
            VecElemU64 selector = (VecElemU64)src0[lane];
            selector = (selector << 32) | (VecElemU64)src1[lane];
            vdst[lane] = 0;

            DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 "
                "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane],
                src1[lane], src2[lane], vdst[lane]);
            DPRINTF(VEGA, "Selector: 0x%08x \n", selector);

            // Each of the four destination bytes is chosen by the
            // corresponding selector byte of S2.
            for (int i = 0; i < 4 ; ++i) {
                VecElemU32 permuted_val = permute(selector, 0xFF
                    & ((VecElemU32)src2[lane] >> (8 * i)));
                vdst[lane] |= (permuted_val << (8 * i));
            }

            DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_FMA_F16 class methods ---

Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_fma_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(FMA);
} // Inst_VOP3__V_FMA_F16

Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16()
{
} // ~Inst_VOP3__V_FMA_F16

// --- description from .arch file ---
// D.f16 = S0.f16 * S1.f16 + S2.f16.
// Fused half precision multiply add.
// Half-precision fused multiply-add: not implemented in this model.
void
Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_DIV_FIXUP_F16 class methods ---

Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_div_fixup_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_DIV_FIXUP_F16

Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16()
{
} // ~Inst_VOP3__V_DIV_FIXUP_F16

// --- description from .arch file ---
// sign_out = sign(S1.f16)^sign(S2.f16);
// if (S2.f16 == NAN)
// D.f16 = Quiet(S2.f16);
// else if (S1.f16 == NAN)
// D.f16 = Quiet(S1.f16);
// else if (S1.f16 == S2.f16 == 0)
// # 0/0
// D.f16 = pele_nan(0xfe00);
// else if (abs(S1.f16) == abs(S2.f16) == +-INF)
// # inf/inf
// D.f16 = pele_nan(0xfe00);
// else if (S1.f16 ==0 || abs(S2.f16) == +-INF)
// # x/0, or inf/y
// D.f16 = sign_out ? -INF : INF;
// else if (abs(S1.f16) == +-INF || S2.f16 == 0)
// # x/inf, 0/y
// D.f16 = sign_out ? -0 : 0;
// else if ((exp(S2.f16) - exp(S1.f16)) < -150)
// D.f16 = sign_out ? -underflow : underflow;
// else if (exp(S1.f16) == 255)
// D.f16 = sign_out ? -overflow : overflow;
// else
// D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16).
// Half precision division fixup.
// S0 = Quotient, S1 = Denominator, S3 = Numerator.
// Given a numerator, denominator, and quotient from a divide, this opcode
// will detect and apply special case numerics, touching up the quotient if
// necessary. This opcode also generates invalid, denorm and divide by
// zero exceptions caused by the division.
+    void
+    Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
+
+    Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_CVT_PKACCUM_U8_F32
+
+    Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32()
+    {
+    } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32
+
+    // --- description from .arch file ---
+    // byte = S1.u[1:0]; bit = byte * 8;
+    // D.u[bit+7:bit] = flt32_to_uint8(S0.f);
+    // Pack converted value of S0.f into byte S1 of the destination.
+    // SQ translates to V_CVT_PK_U8_F32.
+    // Note: this opcode uses src_c to pass destination in as a source.
+    void
+    Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_INTERP_P1_F32 class methods ---
+
+    Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_interp_p1_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_INTERP_P1_F32
+
+    Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32()
+    {
+    } // ~Inst_VOP3__V_INTERP_P1_F32
+
+    // --- description from .arch file ---
+    // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
+    // V_MAD_F32 for SP).
+    // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if
+    // D == S then data corruption will occur.
+    // NOTE: In textual representations the I/J VGPR is the first source and
+    // the attribute is the second source; however in the VOP3 encoding the
+    // attribute is stored in the src0 field and the VGPR is stored in the
+    // src1 field.
+    // Graphics-only parameter-interpolation ops: not implemented in this
+    // compute-focused model.
+    void
+    Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_INTERP_P2_F32 class methods ---
+
+    Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_interp_p2_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_INTERP_P2_F32
+
+    Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32()
+    {
+    } // ~Inst_VOP3__V_INTERP_P2_F32
+
+    // --- description from .arch file ---
+    // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
+    // V_MAD_F32 for SP).
+    // NOTE: In textual representations the I/J VGPR is the first source and
+    // the attribute is the second source; however in the VOP3 encoding the
+    // attribute is stored in the src0 field and the VGPR is stored in the
+    // src1 field.
+    void
+    Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_INTERP_MOV_F32 class methods ---
+
+    Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_interp_mov_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_INTERP_MOV_F32
+
+    Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32()
+    {
+    } // ~Inst_VOP3__V_INTERP_MOV_F32
+
+    // --- description from .arch file ---
+    // D.f = {P10,P20,P0}[S.u]; parameter load.
+    void
+    Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods ---
+
+    Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false)
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP3__V_INTERP_P1LL_F16
+
+    Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16()
+    {
+    } // ~Inst_VOP3__V_INTERP_P1LL_F16
+
+    // --- description from .arch file ---
+    // D.f32 = P10.f16 * S0.f32 + P0.f16.
+    // 'LL' stands for 'two LDS arguments'.
+    // attr_word selects the high or low half 16 bits of each LDS dword
+    // accessed.
+    // This opcode is available for 32-bank LDS only.
+    // NOTE: In textual representations the I/J VGPR is the first source and
+    // the attribute is the second source; however in the VOP3 encoding the
+    // attribute is stored in the src0 field and the VGPR is stored in the
+    // src1 field.
+    void
+    Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods ---
+
+    Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false)
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP3__V_INTERP_P1LV_F16
+
+    Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16()
+    {
+    } // ~Inst_VOP3__V_INTERP_P1LV_F16
+
+    // --- description from .arch file ---
+    // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16.
+    // 'LV' stands for 'One LDS and one VGPR argument'.
+    // S2 holds two parameters, attr_word selects the high or low word of the
+    // VGPR for this calculation, as well as the high or low half of the LDS
+    // data.
+    // Meant for use with 16-bank LDS.
+    // NOTE: In textual representations the I/J VGPR is the first source and
+    // the attribute is the second source; however in the VOP3 encoding the
+    // attribute is stored in the src0 field and the VGPR is stored in the
+    // src1 field.
+    void
+    Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_INTERP_P2_F16 class methods ---
+
+    Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_interp_p2_f16", false)
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP3__V_INTERP_P2_F16
+
+    Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16()
+    {
+    } // ~Inst_VOP3__V_INTERP_P2_F16
+
+    // --- description from .arch file ---
+    // D.f16 = P20.f16 * S0.f32 + S2.f32.
+    // Final computation. attr_word selects LDS high or low 16bits. Used for
+    // both 16- and 32-bank LDS.
+    // Result is always written to the 16 LSBs of the destination VGPR.
+    // NOTE: In textual representations the I/J VGPR is the first source and
+    // the attribute is the second source; however in the VOP3 encoding the
+    // attribute is stored in the src0 field and the VGPR is stored in the
+    // src1 field.
+    void
+    Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_ADD_F64 class methods ---
+
+    Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_add_f64", false)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_ADD_F64
+
+    Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64()
+    {
+    } // ~Inst_VOP3__V_ADD_F64
+
+    // --- description from .arch file ---
+    // D.d = S0.d + S1.d.
+    void
+    Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // ABS/NEG input-modifier bits map 1:1 onto sources:
+        // bit 0 -> src0, bit 1 -> src1, bit 2 -> src2 (unused here).
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Explicit special-case ladder to match ISA-mandated results for
+        // NaN, inf+(-inf), and signed-zero/denorm inputs (denorms are
+        // flushed to signed zero).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isnan(src0[lane]) ||
+                    std::isnan(src1[lane])) {
+                    vdst[lane] = NAN;
+                } else if (std::isinf(src0[lane]) &&
+                           std::isinf(src1[lane])) {
+                    if (std::signbit(src0[lane]) !=
+                        std::signbit(src1[lane])) {
+                        // inf + (-inf) is invalid -> NaN
+                        vdst[lane] = NAN;
+                    } else {
+                        vdst[lane] = src0[lane];
+                    }
+                } else if (std::isinf(src0[lane])) {
+                    vdst[lane] = src0[lane];
+                } else if (std::isinf(src1[lane])) {
+                    vdst[lane] = src1[lane];
+                } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                           std::fpclassify(src0[lane]) == FP_ZERO) {
+                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src1[lane]) == FP_ZERO) {
+                        // -0 + -0 = -0; any other zero/denorm pair = +0
+                        if (std::signbit(src0[lane]) &&
+                            std::signbit(src1[lane])) {
+                            vdst[lane] = -0.0;
+                        } else {
+                            vdst[lane] = 0.0;
+                        }
+                    } else {
+                        vdst[lane] = src1[lane];
+                    }
+                } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                           std::fpclassify(src1[lane]) == FP_ZERO) {
+                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src0[lane]) == FP_ZERO) {
+                        if (std::signbit(src0[lane]) &&
+                            std::signbit(src1[lane])) {
+                            vdst[lane] = -0.0;
+                        } else {
+                            vdst[lane] = 0.0;
+                        }
+                    } else {
+                        vdst[lane] = src0[lane];
+                    }
+                } else {
+                    vdst[lane] = src0[lane] + src1[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MUL_F64 class methods ---
+
+    Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mul_f64", false)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_MUL_F64
+
+    Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64()
+    {
+    } // ~Inst_VOP3__V_MUL_F64
+
+    // --- description from .arch file ---
+    // D.d = S0.d * S1.d.
+    void
+    Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Special cases: 0 * inf (either order) produces NaN; otherwise
+        // zero/denorm and infinity operands produce a correctly signed
+        // zero or infinity (denorms flushed).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isnan(src0[lane]) ||
+                    std::isnan(src1[lane])) {
+                    vdst[lane] = NAN;
+                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                            std::fpclassify(src0[lane]) == FP_ZERO) &&
+                           !std::signbit(src0[lane])) {
+                    if (std::isinf(src1[lane])) {
+                        vdst[lane] = NAN;
+                    } else if (!std::signbit(src1[lane])) {
+                        vdst[lane] = +0.0;
+                    } else {
+                        vdst[lane] = -0.0;
+                    }
+                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                            std::fpclassify(src0[lane]) == FP_ZERO) &&
+                           std::signbit(src0[lane])) {
+                    if (std::isinf(src1[lane])) {
+                        vdst[lane] = NAN;
+                    } else if (std::signbit(src1[lane])) {
+                        vdst[lane] = +0.0;
+                    } else {
+                        vdst[lane] = -0.0;
+                    }
+                } else if (std::isinf(src0[lane]) &&
+                           !std::signbit(src0[lane])) {
+                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src1[lane]) == FP_ZERO) {
+                        vdst[lane] = NAN;
+                    } else if (!std::signbit(src1[lane])) {
+                        vdst[lane] = +INFINITY;
+                    } else {
+                        vdst[lane] = -INFINITY;
+                    }
+                } else if (std::isinf(src0[lane]) &&
+                           std::signbit(src0[lane])) {
+                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src1[lane]) == FP_ZERO) {
+                        vdst[lane] = NAN;
+                    } else if (std::signbit(src1[lane])) {
+                        vdst[lane] = +INFINITY;
+                    } else {
+                        vdst[lane] = -INFINITY;
+                    }
+                } else {
+                    vdst[lane] = src0[lane] * src1[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MIN_F64 class methods ---
+
+    Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_min_f64", false)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_MIN_F64
+
+    Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64()
+    {
+    } // ~Inst_VOP3__V_MIN_F64
+
+    // --- description from .arch file ---
+    // D.d = min(S0.d, S1.d).
+    void
+    Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // std::fmin returns the non-NaN operand when exactly one input
+        // is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::fmin(src0[lane], src1[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MAX_F64 class methods ---
+
+    Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_max_f64", false)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_MAX_F64
+
+    Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64()
+    {
+    } // ~Inst_VOP3__V_MAX_F64
+
+    // --- description from .arch file ---
+    // D.d = max(S0.d, S1.d).
+    void
+    Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::fmax(src0[lane], src1[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_LDEXP_F64 class methods ---
+
+    Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_ldexp_f64", false)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_LDEXP_F64
+
+    Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64()
+    {
+    } // ~Inst_VOP3__V_LDEXP_F64
+
+    // --- description from .arch file ---
+    // D.d = pow(S0.d, S1.i[31:0]).
+    // NOTE(review): despite the .arch text quoting pow(), the operation
+    // implemented here (std::ldexp) is D.d = S0.d * 2^S1.i.
+    void
+    Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // Only src0 is a float; modifiers on src1/src2 are meaningless.
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // NaN/inf pass through; zero/denorm flushes to signed
+                // zero; otherwise scale by 2^src1.
+                if (std::isnan(src0[lane]) || std::isinf(src0[lane])) {
+                    vdst[lane] = src0[lane];
+                } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
+                           || std::fpclassify(src0[lane]) == FP_ZERO) {
+                    if (std::signbit(src0[lane])) {
+                        vdst[lane] = -0.0;
+                    } else {
+                        vdst[lane] = +0.0;
+                    }
+                } else {
+                    vdst[lane] = std::ldexp(src0[lane], src1[lane]);
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MUL_LO_U32 class methods ---
+
+    Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mul_lo_u32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_MUL_LO_U32
+
+    Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32()
+    {
+    } // ~Inst_VOP3__V_MUL_LO_U32
+
+    // --- description from .arch file ---
+    // D.u = S0.u * S1.u.
+    void
+    Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Widen to 64 bits, then keep the low 32 bits of the
+                // product.
+                VecElemI64 s0 = (VecElemI64)src0[lane];
+                VecElemI64 s1 = (VecElemI64)src1[lane];
+                vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MUL_HI_U32 class methods ---
+
+    Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mul_hi_u32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_MUL_HI_U32
+
+    Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32()
+    {
+    } // ~Inst_VOP3__V_MUL_HI_U32
+
+    // --- description from .arch file ---
+    // D.u = (S0.u * S1.u) >> 32.
+    void
+    Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Widen to 64 bits, then keep the high 32 bits of the
+                // product.
+                VecElemI64 s0 = (VecElemI64)src0[lane];
+                VecElemI64 s1 = (VecElemI64)src1[lane];
+                vdst[lane]
+                    = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MUL_HI_I32 class methods ---
+
+    Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mul_hi_i32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_MUL_HI_I32
+
+    Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32()
+    {
+    } // ~Inst_VOP3__V_MUL_HI_I32
+
+    // --- description from .arch file ---
+    // D.i = (S0.i * S1.i) >> 32.
+    void
+    Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Sign-extend to 64 bits, then keep the high 32 bits of
+                // the product.
+                VecElemI64 s0 = (VecElemI64)src0[lane];
+                VecElemI64 s1 = (VecElemI64)src1[lane];
+                vdst[lane]
+                    = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_LDEXP_F32 class methods ---
+
+    Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_ldexp_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_LDEXP_F32
+
+    Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32()
+    {
+    } // ~Inst_VOP3__V_LDEXP_F32
+
+    // --- description from .arch file ---
+    // D.f = pow(S0.f, S1.i)
+    // NOTE(review): despite the .arch text quoting pow(), the operation
+    // implemented here (std::ldexp) is D.f = S0.f * 2^S1.i.
+    void
+    Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // Fix: apply the abs/neg input modifiers for src0 (bit 0).
+        // Previously only bits 1-2 were asserted away, so a kernel using
+        // an abs/neg modifier on src0 had it silently ignored; this makes
+        // the behavior consistent with V_LDEXP_F64 above.
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::ldexp(src0[lane], src1[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_READLANE_B32 class methods ---
+
+    Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_readlane_b32", true)
+    {
+        setFlag(ALU);
+        setFlag(IgnoreExec);
+    } // Inst_VOP3__V_READLANE_B32
+
+    Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32()
+    {
+    } // ~Inst_VOP3__V_READLANE_B32
+
+    // --- description from .arch file ---
+    // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR#
+    // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask.
+    // Input and output modifiers not supported; this is an untyped operation.
+    void
+    Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU32 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Lane select is masked to 0..63 (wave64).
+        sdst = src0[src1.rawData() & 0x3f];
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_WRITELANE_B32 class methods ---
+
+    Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_writelane_b32", false)
+    {
+        setFlag(ALU);
+        setFlag(IgnoreExec);
+    } // Inst_VOP3__V_WRITELANE_B32
+
+    Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32()
+    {
+    } // ~Inst_VOP3__V_WRITELANE_B32
+
+    // --- description from .arch file ---
+    // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data
+    // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores
+    // exec mask.
+    // Input and output modifiers not supported; this is an untyped operation.
+    // SQ translates to V_MOV_B32.
+    void
+    Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.read();
+        src1.read();
+        // Read-modify-write: only one lane of vdst is replaced, so the
+        // current contents must be read first.
+        vdst.read();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Lane select is masked to 0..63 (wave64).
+        vdst[src1.rawData() & 0x3f] = src0.rawData();
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_BCNT_U32_B32 class methods ---
+
+    Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_BCNT_U32_B32
+
+    Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32()
+    {
+    } // ~Inst_VOP3__V_BCNT_U32_B32
+
+    // --- description from .arch file ---
+    // D.u = CountOneBits(S0.u) + S1.u. Bit count.
+    void
+    Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = popCount(src0[lane]) + src1[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods ---
+
+    Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_MBCNT_LO_U32_B32
+
+    Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32()
+    {
+    } // ~Inst_VOP3__V_MBCNT_LO_U32_B32
+
+    // --- description from .arch file ---
+    // ThreadMask = (1 << ThreadPosition) - 1;
+    // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
+    // Masked bit count, ThreadPosition is the position of this thread in the
+    // wavefront (in 0..63).
+    void
+    Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+        uint64_t threadMask = 0;
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Mask selects all lanes below this one; the LO variant
+                // counts only bits for lanes 0..31.
+                threadMask = ((1LL << lane) - 1LL);
+                vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
+                             src1[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---
+
+    Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_MBCNT_HI_U32_B32
+
+    Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32()
+    {
+    } // ~Inst_VOP3__V_MBCNT_HI_U32_B32
+
+    // --- description from .arch file ---
+    // ThreadMask = (1 << ThreadPosition) - 1;
+    // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u.
+    // Masked bit count, ThreadPosition is the position of this thread in the
+    // wavefront (in 0..63).
+    void
+    Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+        uint64_t threadMask = 0;
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // HI variant counts only the bits for lanes 32..63 of the
+                // below-this-lane mask.
+                threadMask = ((1LL << lane) - 1LL);
+                vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
+                             src1[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_LSHLREV_B64 class methods ---
+
+    Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_lshlrev_b64", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_LSHLREV_B64
+
+    Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64()
+    {
+    } // ~Inst_VOP3__V_LSHLREV_B64
+
+    // --- description from .arch file ---
+    // D.u64 = S1.u64 << S0.u[5:0].
+    // SQ translates this to an internal SP opcode.
+    void
+    Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // "REV": shift amount comes from src0, value from src1.
+                vdst[lane] = src1[lane] << bits(src0[lane], 5, 0);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_LSHRREV_B64 class methods ---
+
+    Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_lshrrev_b64", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_LSHRREV_B64
+
+    Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64()
+    {
+    } // ~Inst_VOP3__V_LSHRREV_B64
+
+    // --- description from .arch file ---
+    // D.u64 = S1.u64 >> S0.u[5:0].
+    // The vacated bits are set to zero.
+    // SQ translates this to an internal SP opcode.
+    void
+    Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Logical shift: src1 is unsigned, so >> zero-fills.
+                vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_ASHRREV_I64 class methods ---
+
+    Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_ashrrev_i64", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_ASHRREV_I64
+
+    Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64()
+    {
+    } // ~Inst_VOP3__V_ASHRREV_I64
+
+    // --- description from .arch file ---
+    // D.u64 = signext(S1.u64) >> S0.u[5:0].
+    // The vacated bits are set to the sign bit of the input value.
+    // SQ translates this to an internal SP opcode.
+    void
+    Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // src1 is signed (I64), so >> is an arithmetic shift that
+                // replicates the sign bit into the vacated positions.
+                vdst[lane]
+                    = src1[lane] >> bits(src0[lane], 5, 0);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods ---
+
+    Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_trig_preop_f64", false)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_TRIG_PREOP_F64
+
+    Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64()
+    {
+    } // ~Inst_VOP3__V_TRIG_PREOP_F64
+
+    // --- description from .arch file ---
+    // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation
+    // returns an aligned, double precision segment of 2/PI needed to do range
+    // reduction on S0.d (double-precision value). Multiple segments can be
+    // specified through S1.u[4:0]. Rounding is always round-to-zero. Large
+    // inputs (exp > 1968) are scaled to avoid loss of precision through
+    // denormalization.
+    void
+    Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP3__V_BFM_B32 class methods ---
+
+    Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_bfm_b32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_BFM_B32
+
+    Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32()
+    {
+    } // ~Inst_VOP3__V_BFM_B32
+
+    // --- description from .arch file ---
+    // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0]; bitfield mask.
+    //
+    // Fix: this span was corrupted in transit ("((1<wavefront();") —
+    // the text from the `<<` in the description through the execute()
+    // signature and the Wavefront fetch was eaten by markup escaping.
+    // Reconstructed to match the sibling VOP3 execute() methods.
+    void
+    Inst_VOP3__V_BFM_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // A run of S0[4:0] ones, shifted left by S1[4:0].
+                vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1)
+                    << bits(src1[lane], 4, 0);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods ---
+
+    Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_CVT_PKNORM_I16_F32
+
+    Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32()
+    {
+    } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32
+
+    // --- description from .arch file ---
+    // D = {(snorm)S1.f, (snorm)S0.f}.
+ void + Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods --- + + Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKNORM_U16_F32 + + Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32() + { + } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32 + + // --- description from .arch file --- + // D = {(unorm)S1.f, (unorm)S0.f}. + void + Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods --- + + Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKRTZ_F16_F32 + + Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32() + { + } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32 + + // --- description from .arch file --- + // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero + // --- regardless of current round mode setting in hardware. + // This opcode is intended for use with 16-bit compressed exports. + // See V_CVT_F16_F32 for a version that respects the current rounding mode. + void + Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods --- + + Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_CVT_PK_U16_U32 + + Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32() + { + } // ~Inst_VOP3__V_CVT_PK_U16_U32 + + // --- description from .arch file --- + // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}. 
+ void + Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods --- + + Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_CVT_PK_I16_I32 + + Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32() + { + } // ~Inst_VOP3__V_CVT_PK_I16_I32 + + // --- description from .arch file --- + // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}. + void + Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3_cmp.cc b/src/arch/amdgpu/vega/insts/vop3_cmp.cc new file mode 100644 index 0000000000..4bbec930e6 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3_cmp.cc @@ -0,0 +1,8145 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3__V_CMP_CLASS_F32 class methods --- + + Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_CLASS_F32 + + Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32() + { + } // ~Inst_VOP3__V_CMP_CLASS_F32 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. 
+ // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F32 class methods --- + + 
Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F32 + + Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F32 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.f + // The function reports true if the floating point value is *any* of the + // numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_CLASS_F64 class methods --- + + 
Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_CLASS_F64 + + Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64() + { + } // ~Inst_VOP3__V_CMP_CLASS_F64 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.d + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F64 class methods --- + + Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64( + 
InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F64 + + Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F64 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.d + // The function reports true if the floating point value is *any* of the + // numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + 
continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_CLASS_F16 class methods --- + + Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_CLASS_F16 + + Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16() + { + } // ~Inst_VOP3__V_CMP_CLASS_F16 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F16 class methods --- + + Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F16 + + Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F16 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // --- S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_F_F16 class methods --- + + Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_F_F16 + + Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16() + { + } // ~Inst_VOP3__V_CMP_F_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_LT_F16 class methods --- + + Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_LT_F16 + + Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16() + { + } // ~Inst_VOP3__V_CMP_LT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_F16 class methods --- + + Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_EQ_F16 + + Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16() + { + } // ~Inst_VOP3__V_CMP_EQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_LE_F16 class methods --- + + Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_LE_F16 + + Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16() + { + } // ~Inst_VOP3__V_CMP_LE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_GT_F16 class methods --- + + Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_GT_F16 + + Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16() + { + } // ~Inst_VOP3__V_CMP_GT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_LG_F16 class methods --- + + Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lg_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_LG_F16 + + Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16() + { + } // ~Inst_VOP3__V_CMP_LG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_GE_F16 class methods --- + + Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_GE_F16 + + Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16() + { + } // ~Inst_VOP3__V_CMP_GE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_O_F16 class methods --- + + Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_o_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_O_F16 + + Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16() + { + } // ~Inst_VOP3__V_CMP_O_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_U_F16 class methods --- + + Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_u_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_U_F16 + + Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() + { + } // ~Inst_VOP3__V_CMP_U_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NGE_F16 class methods --- + + Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nge_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NGE_F16 + + Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() + { + } // ~Inst_VOP3__V_CMP_NGE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLG_F16 class methods --- + + Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlg_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLG_F16 + + Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() + { + } // ~Inst_VOP3__V_CMP_NLG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NGT_F16 class methods --- + + Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ngt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NGT_F16 + + Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() + { + } // ~Inst_VOP3__V_CMP_NGT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLE_F16 class methods --- + + Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nle_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLE_F16 + + Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() + { + } // ~Inst_VOP3__V_CMP_NLE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NEQ_F16 class methods --- + + Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_neq_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NEQ_F16 + + Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() + { + } // ~Inst_VOP3__V_CMP_NEQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLT_F16 class methods --- + + Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLT_F16 + + Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() + { + } // ~Inst_VOP3__V_CMP_NLT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_TRU_F16 class methods --- + + Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_tru_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_TRU_F16 + + Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() + { + } // ~Inst_VOP3__V_CMP_TRU_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_F16 class methods --- + + Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_f16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_F16 + + Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() + { + } // ~Inst_VOP3__V_CMPX_F_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_F16 class methods --- + + Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_F16 + + Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() + { + } // ~Inst_VOP3__V_CMPX_LT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_F16 class methods --- + + Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_F16 + + Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() + { + } // ~Inst_VOP3__V_CMPX_EQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_F16 class methods --- + + Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_F16 + + Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() + { + } // ~Inst_VOP3__V_CMPX_LE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_F16 class methods --- + + Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_F16 + + Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() + { + } // ~Inst_VOP3__V_CMPX_GT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_LG_F16 class methods --- + + Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lg_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LG_F16 + + Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() + { + } // ~Inst_VOP3__V_CMPX_LG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_F16 class methods --- + + Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_F16 + + Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() + { + } // ~Inst_VOP3__V_CMPX_GE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_O_F16 class methods --- + + Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_o_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_O_F16 + + Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() + { + } // ~Inst_VOP3__V_CMPX_O_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. 
void
Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // 16-bit float compares are not implemented; executing one panics.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_U_F16 class methods ---

Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_u_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_U_F16

Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16()
{
} // ~Inst_VOP3__V_CMPX_U_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC
// encoding.
void
Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented for F16 operands.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_NGE_F16 class methods ---

Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_nge_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NGE_F16

Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16()
{
} // ~Inst_VOP3__V_CMPX_NGE_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented for F16 operands.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_NLG_F16 class methods ---

Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_nlg_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NLG_F16

Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16()
{
} // ~Inst_VOP3__V_CMPX_NLG_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // 16-bit float compares are not implemented; executing one panics.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_NGT_F16 class methods ---

Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_ngt_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NGT_F16

Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16()
{
} // ~Inst_VOP3__V_CMPX_NGT_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented for F16 operands.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_NLE_F16 class methods ---

Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_nle_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NLE_F16

Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16()
{
} // ~Inst_VOP3__V_CMPX_NLE_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented for F16 operands.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_NEQ_F16 class methods ---

Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_neq_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NEQ_F16

Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16()
{
} // ~Inst_VOP3__V_CMPX_NEQ_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // 16-bit float compares are not implemented; executing one panics.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_NLT_F16 class methods ---

Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_nlt_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NLT_F16

Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16()
{
} // ~Inst_VOP3__V_CMPX_NLT_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented for F16 operands.
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_CMPX_TRU_F16 class methods ---

Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_tru_f16", true)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_TRU_F16

Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16()
{
} // ~Inst_VOP3__V_CMPX_TRU_F16

// --- description from .arch file ---
// EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // "True" compare needs no F16 arithmetic, so it is implemented:
    // set the result bit for every active lane only.
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 1);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_F_F32 class methods ---

Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_f_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_F_F32

Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32()
{
} // ~Inst_VOP3__V_CMP_F_F32

// --- description from .arch file ---
// D.u64[threadID] = 0; D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_F32 class methods --- + + Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_LT_F32 + + Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() + { + } // ~Inst_VOP3__V_CMP_LT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_F32 class methods --- + + Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_EQ_F32 + + Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() + { + } // ~Inst_VOP3__V_CMP_EQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
void
Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_LE_F32 class methods ---

Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_le_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_LE_F32

Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32()
{
} // ~Inst_VOP3__V_CMP_LE_F32

// --- description from .arch file ---
// D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_GT_F32 class methods ---

Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_gt_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_GT_F32

Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32()
{
} // ~Inst_VOP3__V_CMP_GT_F32

// --- description from .arch file ---
// D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LG_F32 class methods --- + + Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lg_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_LG_F32 + + Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() + { + } // ~Inst_VOP3__V_CMP_LG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_F32 class methods --- + + Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_GE_F32 + + Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() + { + } // ~Inst_VOP3__V_CMP_GE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
void
Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_O_F32 class methods ---

Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_o_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_O_F32

Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32()
{
} // ~Inst_VOP3__V_CMP_O_F32

// --- description from .arch file ---
// D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // "Ordered": true iff neither source is NaN.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, (!std::isnan(src0[lane])
                && !std::isnan(src1[lane])) ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_U_F32 class methods ---

Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_u_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_U_F32

Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32()
{
} // ~Inst_VOP3__V_CMP_U_F32

// --- description from .arch file ---
// D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // "Unordered": true iff at least one source is NaN.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, (std::isnan(src0[lane])
                || std::isnan(src1[lane])) ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_NGE_F32 class methods ---

Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_nge_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_NGE_F32

Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32()
{
} // ~Inst_VOP3__V_CMP_NGE_F32

// --- description from .arch file ---
// D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // N* compares negate the ordered compare, so NaN inputs yield 1.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_NLG_F32 class methods ---

Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_nlg_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_NLG_F32

Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32()
{
} // ~Inst_VOP3__V_CMP_NLG_F32

// --- description from .arch file ---
// D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // NLG negates the ordered less-or-greater compare; NaN inputs
    // therefore yield 1.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] < src1[lane]
                || src0[lane] > src1[lane]) ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_NGT_F32 class methods ---

Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_ngt_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_NGT_F32

Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32()
{
} // ~Inst_VOP3__V_CMP_NGT_F32

// --- description from .arch file ---
// D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_NLE_F32 class methods ---

Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_nle_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_NLE_F32

Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32()
{
} // ~Inst_VOP3__V_CMP_NLE_F32

// --- description from .arch file ---
// D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_NEQ_F32 class methods ---

Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_neq_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_NEQ_F32

Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32()
{
} // ~Inst_VOP3__V_CMP_NEQ_F32

// --- description from .arch file ---
// D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // NEQ is !(S0 == S1): operator!= matches, including the
    // unordered (NaN yields 1) behavior.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_NLT_F32 class methods ---

Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_nlt_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_NLT_F32

Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32()
{
} // ~Inst_VOP3__V_CMP_NLT_F32

// --- description from .arch file ---
// D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMP_TRU_F32 class methods ---

Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmp_tru_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CMP_TRU_F32

Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32()
{
} // ~Inst_VOP3__V_CMP_TRU_F32

// --- description from .arch file ---
// D.u64[threadID] = 1; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "True" compare: set the result bit of every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 1);
        }
    }

    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_F_F32 class methods ---

Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_f_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_F_F32

Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32()
{
} // ~Inst_VOP3__V_CMPX_F_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "False" compare: clear the result bit of every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_LT_F32 class methods ---

Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_lt_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_LT_F32

Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32()
{
} // ~Inst_VOP3__V_CMPX_LT_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_EQ_F32 class methods ---

Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_eq_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_EQ_F32

Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32()
{
} // ~Inst_VOP3__V_CMPX_EQ_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_LE_F32 class methods ---

Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_le_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_LE_F32

Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32()
{
} // ~Inst_VOP3__V_CMPX_LE_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_GT_F32 class methods ---

Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_gt_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_GT_F32

Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32()
{
} // ~Inst_VOP3__V_CMPX_GT_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_LG_F32 class methods ---

Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_lg_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_LG_F32

Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32()
{
} // ~Inst_VOP3__V_CMPX_LG_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // LG is the ordered less-or-greater compare (0 when either source
    // is NaN), hence (a < b || a > b) rather than operator!=.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, (src0[lane] < src1[lane]
                || src0[lane] > src1[lane]) ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_GE_F32 class methods ---

Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_ge_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_GE_F32

Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32()
{
} // ~Inst_VOP3__V_CMPX_GE_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_O_F32 class methods ---

Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_o_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_O_F32

Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32()
{
} // ~Inst_VOP3__V_CMPX_O_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
// encoding.
void
Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // "Ordered": true iff neither source is NaN.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, (!std::isnan(src0[lane])
                && !std::isnan(src1[lane])) ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_U_F32 class methods ---

Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_u_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_U_F32

Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32()
{
} // ~Inst_VOP3__V_CMPX_U_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC
// encoding.
void
Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // "Unordered": true iff at least one source is NaN.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, (std::isnan(src0[lane])
                || std::isnan(src1[lane])) ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_NGE_F32 class methods ---

Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_nge_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NGE_F32

Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32()
{
} // ~Inst_VOP3__V_CMPX_NGE_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // N* compares negate the ordered compare, so NaN inputs yield 1.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_NLG_F32 class methods ---

Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_nlg_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NLG_F32

Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32()
{
} // ~Inst_VOP3__V_CMPX_NLG_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // NLG negates the ordered less-or-greater compare; NaN inputs
    // therefore yield 1.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] < src1[lane]
                || src0[lane] > src1[lane]) ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_NGT_F32 class methods ---

Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_ngt_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NGT_F32

Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32()
{
} // ~Inst_VOP3__V_CMPX_NGT_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_NLE_F32 class methods ---

Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_nle_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NLE_F32

Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32()
{
} // ~Inst_VOP3__V_CMPX_NLE_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only active lanes update their result bit.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
        }
    }

    // CMPX also copies the per-lane results into the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
} // execute
// --- Inst_VOP3__V_CMPX_NEQ_F32 class methods ---

Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_cmpx_neq_f32", true)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(WritesEXEC); // CMPX compares also update the EXEC mask
} // Inst_VOP3__V_CMPX_NEQ_F32

Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32()
{
} // ~Inst_VOP3__V_CMPX_NEQ_F32

// --- description from .arch file ---
// EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F32 class methods --- + + Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F32 + + Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() + { + } // ~Inst_VOP3__V_CMPX_NLT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F32 class methods --- + + Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F32 + + Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() + { + } // ~Inst_VOP3__V_CMPX_TRU_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_F64 class methods --- + + Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_F_F64 + + Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() + { + } // ~Inst_VOP3__V_CMP_F_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+    void
+    Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        // Always-false compare: clear the bit of every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LT_F64 class methods ---
+
+    Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_lt_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_LT_F64
+
+    Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_LT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; bit 2 would name a third
+        // source, which FP compares do not have.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] < opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_EQ_F64 class methods ---
+
+    Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_eq_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_EQ_F64
+
+    Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_EQ_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] == opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LE_F64 class methods ---
+
+    Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_le_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_LE_F64
+
+    Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_LE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] <= opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GT_F64 class methods ---
+
+    Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_gt_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_GT_F64
+
+    Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_GT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] > opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LG_F64 class methods ---
+
+    Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_lg_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_LG_F64
+
+    Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_LG_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // LG ("less or greater"): ordered and not equal.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] < opB[lane]
+                || opA[lane] > opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GE_F64 class methods ---
+
+    Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ge_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_GE_F64
+
+    Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_GE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; bit 2 would name a third
+        // source, which FP compares do not have.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] >= opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_O_F64 class methods ---
+
+    Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_o_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_O_F64
+
+    Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_O_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Ordered: true only when neither source is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (!std::isnan(opA[lane])
+                && !std::isnan(opB[lane])) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_U_F64 class methods ---
+
+    Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_u_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_U_F64
+
+    Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_U_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Unordered: true when either source is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (std::isnan(opA[lane])
+                || std::isnan(opB[lane])) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NGE_F64 class methods ---
+
+    Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_nge_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_NGE_F64
+
+    Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_NGE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Negated compares are true for unordered (NaN) inputs.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, !(opA[lane] >= opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NLG_F64 class methods ---
+
+    Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_nlg_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_NLG_F64
+
+    Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_NLG_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // NLG: equal or unordered (NaN).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, !(opA[lane] < opB[lane]
+                || opA[lane] > opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NGT_F64 class methods ---
+
+    Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ngt_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_NGT_F64
+
+    Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_NGT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; bit 2 would name a third
+        // source, which FP compares do not have.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Negated compare: also true for unordered (NaN) inputs.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, !(opA[lane] > opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NLE_F64 class methods ---
+
+    Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_nle_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_NLE_F64
+
+    Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_NLE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, !(opA[lane] <= opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NEQ_F64 class methods ---
+
+    Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_neq_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_NEQ_F64
+
+    Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_NEQ_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // operator!= is already true for unordered (NaN) inputs.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] != opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NLT_F64 class methods ---
+
+    Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_nlt_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_NLT_F64
+
+    Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_NLT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, !(opA[lane] < opB[lane]) ? 1 : 0);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_TRU_F64 class methods ---
+
+    Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_tru_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP3__V_CMP_TRU_F64
+
+    Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64()
+    {
+    } // ~Inst_VOP3__V_CMP_TRU_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        // Always-true compare: set the bit of every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, 1);
+        }
+
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_F_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_f_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_F_F64
+
+    Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_F_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        // Always-false compare: clear the bit of every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LT_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_lt_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LT_F64
+
+    Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_LT_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; bit 2 would name a third
+        // source, which FP compares do not have.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] < opB[lane]) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_EQ_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_eq_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_EQ_F64
+
+    Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_EQ_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] == opB[lane]) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LE_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_le_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LE_F64
+
+    Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_LE_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] <= opB[lane]) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GT_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_gt_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GT_F64
+
+    Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_GT_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] > opB[lane]) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LG_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_lg_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LG_F64
+
+    Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_LG_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; bit 2 would name a third
+        // source, which FP compares do not have.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // LG ("less or greater"): ordered and not equal.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] < opB[lane]
+                || opA[lane] > opB[lane]) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GE_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ge_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GE_F64
+
+    Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_GE_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (opA[lane] >= opB[lane]) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_O_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_o_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_O_F64
+
+    Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_O_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
+    // encoding.
+    void
+    Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Ordered: true only when neither source is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (!std::isnan(opA[lane])
+                && !std::isnan(opB[lane])) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_U_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_u_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_U_F64
+
+    Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_U_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC
+    // encoding.
+    void
+    Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Unordered: true when either source is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, (std::isnan(opA[lane])
+                || std::isnan(opB[lane])) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NGE_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_nge_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NGE_F64
+
+    Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_NGE_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 opA(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 opB(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 result(gpuDynInst, instData.VDST);
+
+        opA.readSrc();
+        opB.readSrc();
+
+        // Per-source ABS/NEG input modifiers; no third source exists.
+        if (instData.ABS & 0x1) { opA.absModifier(); }
+        if (instData.ABS & 0x2) { opB.absModifier(); }
+        if (extData.NEG & 0x1) { opA.negModifier(); }
+        if (extData.NEG & 0x2) { opB.negModifier(); }
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // Negated compare: also true for unordered (NaN) inputs.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+            result.setBit(lane, !(opA[lane] >= opB[lane]) ? 1 : 0);
+        }
+
+        // CMPX: the compare result also becomes the new EXEC mask.
+        wf->execMask() = result.rawData();
+        result.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NLG_F64 class methods ---
+
+    Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_nlg_f64", true)
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NLG_F64
+
+    Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64()
+    {
+    } // ~Inst_VOP3__V_CMPX_NLG_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGT_F64 class methods --- + + Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ngt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGT_F64 + + Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() + { + } // ~Inst_VOP3__V_CMPX_NGT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLE_F64 class methods --- + + Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nle_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLE_F64 + + Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() + { + } // ~Inst_VOP3__V_CMPX_NLE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NEQ_F64 class methods --- + + Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_neq_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NEQ_F64 + + Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() + { + } // ~Inst_VOP3__V_CMPX_NEQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F64 class methods --- + + Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F64 + + Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() + { + } // ~Inst_VOP3__V_CMPX_NLT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F64 class methods --- + + Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F64 + + Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() + { + } // ~Inst_VOP3__V_CMPX_TRU_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_I16 class methods --- + + Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_I16 + + Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() + { + } // ~Inst_VOP3__V_CMP_F_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_I16 class methods --- + + Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_I16 + + Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() + { + } // ~Inst_VOP3__V_CMP_LT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_I16 class methods --- + + Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_I16 + + Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() + { + } // ~Inst_VOP3__V_CMP_EQ_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_I16 class methods --- + + Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_I16 + + Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() + { + } // ~Inst_VOP3__V_CMP_LE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_I16 class methods --- + + Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_I16 + + Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() + { + } // ~Inst_VOP3__V_CMP_GT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_I16 class methods --- + + Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_I16 + + Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() + { + } // ~Inst_VOP3__V_CMP_NE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_I16 class methods --- + + Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_I16 + + Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() + { + } // ~Inst_VOP3__V_CMP_GE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_I16 class methods --- + + Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_I16 + + Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() + { + } // ~Inst_VOP3__V_CMP_T_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_U16 class methods --- + + Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_U16 + + Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() + { + } // ~Inst_VOP3__V_CMP_F_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_U16 class methods --- + + Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_U16 + + Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() + { + } // ~Inst_VOP3__V_CMP_LT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_U16 class methods --- + + Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_U16 + + Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() + { + } // ~Inst_VOP3__V_CMP_EQ_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_U16 class methods --- + + Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_U16 + + Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() + { + } // ~Inst_VOP3__V_CMP_LE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_U16 class methods --- + + Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_U16 + + Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() + { + } // ~Inst_VOP3__V_CMP_GT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_U16 class methods --- + + Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_U16 + + Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() + { + } // ~Inst_VOP3__V_CMP_NE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_U16 class methods --- + + Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_U16 + + Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() + { + } // ~Inst_VOP3__V_CMP_GE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_U16 class methods --- + + Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_U16 + + Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() + { + } // ~Inst_VOP3__V_CMP_T_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_I16 class methods --- + + Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_I16 + + Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() + { + } // ~Inst_VOP3__V_CMPX_F_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_I16 class methods --- + + Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_I16 + + Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() + { + } // ~Inst_VOP3__V_CMPX_LT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_I16 class methods --- + + Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_I16 + + Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() + { + } // ~Inst_VOP3__V_CMPX_EQ_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_I16 class methods --- + + Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_I16 + + Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() + { + } // ~Inst_VOP3__V_CMPX_LE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_I16 class methods --- + + Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_i16", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_I16 + + Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() + { + } // ~Inst_VOP3__V_CMPX_GT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+    void
+    Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Signed 16-bit greater-than; result bitmask written to SGPR pair.
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the compare result also replaces the EXEC mask.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NE_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ne_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NE_I16
+
+    Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_NE_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GE_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ge_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GE_I16
+
+    Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GE_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_T_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_t_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_T_I16
+
+    Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_T_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-true compare: set the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_F_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_f_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_F_U16
+
+    Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_F_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-false compare: clear the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LT_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_lt_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LT_U16
+
+    Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LT_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Unsigned 16-bit less-than; sources read as u16.
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the compare result also replaces the EXEC mask.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_EQ_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_eq_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_EQ_U16
+
+    Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_EQ_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Fixed: read sources as u16, not i16, to match the unsigned
+        // opcode (==/!= results are sign-agnostic, but the ordering
+        // variants below were genuinely wrong; keep the group uniform).
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_le_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LE_U16
+
+    Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Fixed: this is an *unsigned* compare. Reading the sources as
+        // i16 made any value >= 0x8000 order as negative (e.g.
+        // 0x8000 <= 0x0001 was true); u16 operands give the ISA-mandated
+        // unsigned ordering.
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GT_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_gt_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GT_U16
+
+    Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GT_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Fixed: unsigned compare — sources must be u16 (was i16, which
+        // gave signed ordering for values >= 0x8000).
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ne_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NE_U16
+
+    Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_NE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Fixed: read sources as u16 for consistency with the unsigned
+        // opcode (!= itself is sign-agnostic).
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ge_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GE_U16
+
+    Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Fixed: unsigned compare — sources must be u16 (was i16, which
+        // gave signed ordering for values >= 0x8000).
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_T_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_t_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_T_U16
+
+    Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_T_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-true compare: set the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        // CMPX variant: the compare result also replaces the EXEC mask.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_F_I32 class methods ---
+
+    Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_f_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_F_I32
+
+    Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_F_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-false compare: clear the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LT_I32 class methods ---
+
+    Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_lt_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LT_I32
+
+    Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_LT_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Signed 32-bit compare; result bitmask written to SGPR pair.
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_EQ_I32 class methods ---
+
+    Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_eq_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_EQ_I32
+
+    Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_EQ_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LE_I32 class methods ---
+
+    Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_le_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LE_I32
+
+    Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_LE_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GT_I32 class methods ---
+
+    Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_gt_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GT_I32
+
+    Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_GT_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NE_I32 class methods ---
+
+    Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ne_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_NE_I32
+
+    Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_NE_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GE_I32 class methods ---
+
+    Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ge_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GE_I32
+
+    Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_GE_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_T_I32 class methods ---
+
+    Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_t_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_T_I32
+
+    Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_T_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-true compare: set the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_F_U32 class methods ---
+
+    Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_f_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_F_U32
+
+    Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_F_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-false compare: clear the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LT_U32 class methods ---
+
+    Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_lt_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LT_U32
+
+    Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_LT_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Unsigned 32-bit compare; sources correctly read as u32.
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_EQ_U32 class methods ---
+
+    Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_eq_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_EQ_U32
+
+    Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_EQ_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LE_U32 class methods ---
+
+    Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_le_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LE_U32
+
+    Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_LE_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GT_U32 class methods ---
+
+    Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_gt_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GT_U32
+
+    Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_GT_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NE_U32 class methods ---
+
+    Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ne_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_NE_U32
+
+    Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_NE_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GE_U32 class methods ---
+
+    Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ge_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GE_U32
+
+    Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_GE_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_T_U32 class methods ---
+
+    Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_t_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_T_U32
+
+    Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_T_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-true compare: set the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_F_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_f_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_F_I32
+
+    Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_F_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Trivially-false compare: clear the bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        // CMPX variant: the compare result also replaces the EXEC mask.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LT_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_lt_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LT_I32
+
+    Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_LT_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Signed 32-bit compare; sources correctly read as i32.
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_EQ_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_eq_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_EQ_I32
+
+    Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_EQ_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LE_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_le_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LE_I32
+
+    Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_LE_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GT_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_gt_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GT_I32
+
+    Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_GT_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NE_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ne_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NE_I32
+
+    Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_NE_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GE_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ge_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GE_I32
+
+    Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_GE_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_T_I32 class methods ---
+
+    Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_t_i32", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_T_I32
+
+    Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32()
+    {
+    } // ~Inst_VOP3__V_CMPX_T_I32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_U32 class methods --- + + Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_U32 + + Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() + { + } // ~Inst_VOP3__V_CMPX_F_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_U32 class methods --- + + Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_U32 + + Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() + { + } // ~Inst_VOP3__V_CMPX_LT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_U32 class methods --- + + Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_U32 + + Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() + { + } // ~Inst_VOP3__V_CMPX_EQ_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_U32 class methods --- + + Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_U32 + + Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() + { + } // ~Inst_VOP3__V_CMPX_LE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_U32 class methods --- + + Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_U32 + + Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() + { + } // ~Inst_VOP3__V_CMPX_GT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_U32 class methods --- + + Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_U32 + + Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() + { + } // ~Inst_VOP3__V_CMPX_NE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_U32 class methods --- + + Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_U32 + + Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() + { + } // ~Inst_VOP3__V_CMPX_GE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_U32 class methods --- + + Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_U32 + + Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() + { + } // ~Inst_VOP3__V_CMPX_T_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_I64 class methods --- + + Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_I64 + + Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() + { + } // ~Inst_VOP3__V_CMP_F_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_I64 class methods --- + + Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_I64 + + Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() + { + } // ~Inst_VOP3__V_CMP_LT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_I64 class methods --- + + Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_I64 + + Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() + { + } // ~Inst_VOP3__V_CMP_EQ_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_I64 class methods --- + + Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_I64 + + Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() + { + } // ~Inst_VOP3__V_CMP_LE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_I64 class methods --- + + Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_I64 + + Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() + { + } // ~Inst_VOP3__V_CMP_GT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_I64 class methods --- + + Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_I64 + + Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() + { + } // ~Inst_VOP3__V_CMP_NE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_I64 class methods --- + + Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_I64 + + Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() + { + } // ~Inst_VOP3__V_CMP_GE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_I64 class methods --- + + Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_I64 + + Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() + { + } // ~Inst_VOP3__V_CMP_T_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_U64 class methods --- + + Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_U64 + + Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() + { + } // ~Inst_VOP3__V_CMP_F_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_U64 class methods --- + + Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_U64 + + Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() + { + } // ~Inst_VOP3__V_CMP_LT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_U64 class methods --- + + Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_U64 + + Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() + { + } // ~Inst_VOP3__V_CMP_EQ_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_U64 class methods --- + + Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_U64 + + Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() + { + } // ~Inst_VOP3__V_CMP_LE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_U64 class methods --- + + Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_U64 + + Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() + { + } // ~Inst_VOP3__V_CMP_GT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_U64 class methods --- + + Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_U64 + + Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() + { + } // ~Inst_VOP3__V_CMP_NE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_U64 class methods --- + + Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_U64 + + Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() + { + } // ~Inst_VOP3__V_CMP_GE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_U64 class methods --- + + Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_U64 + + Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() + { + } // ~Inst_VOP3__V_CMP_T_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_I64 class methods --- + + Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_I64 + + Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() + { + } // ~Inst_VOP3__V_CMPX_F_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_I64 class methods --- + + Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_I64 + + Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() + { + } // ~Inst_VOP3__V_CMPX_LT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_I64 class methods --- + + Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_I64 + + Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() + { + } // ~Inst_VOP3__V_CMPX_EQ_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_I64 class methods --- + + Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_I64 + + Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() + { + } // ~Inst_VOP3__V_CMPX_LE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_I64 class methods --- + + Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_I64 + + Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() + { + } // ~Inst_VOP3__V_CMPX_GT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_I64 class methods --- + + Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_I64 + + Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() + { + } // ~Inst_VOP3__V_CMPX_NE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_I64 class methods --- + + Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_I64 + + Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() + { + } // ~Inst_VOP3__V_CMPX_GE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_I64 class methods --- + + Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_I64 + + Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() + { + } // ~Inst_VOP3__V_CMPX_T_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_U64 class methods --- + + Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_U64 + + Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() + { + } // ~Inst_VOP3__V_CMPX_F_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_U64 class methods --- + + Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_U64 + + Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() + { + } // ~Inst_VOP3__V_CMPX_LT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_U64 class methods --- + + Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_U64 + + Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() + { + } // ~Inst_VOP3__V_CMPX_EQ_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_U64 class methods --- + + Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_U64 + + Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() + { + } // ~Inst_VOP3__V_CMPX_LE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_U64 class methods --- + + Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_U64 + + Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() + { + } // ~Inst_VOP3__V_CMPX_GT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_U64 class methods --- + + Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_U64 + + Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() + { + } // ~Inst_VOP3__V_CMPX_NE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_U64 class methods --- + + Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_U64 + + Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() + { + } // ~Inst_VOP3__V_CMPX_GE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_U64 class methods --- + + Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_U64 + + Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() + { + } // ~Inst_VOP3__V_CMPX_T_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc index eddb1e7ad5..85f0af2a51 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.cc +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -31,6 +31,7 @@ #include "arch/amdgpu/vega/insts/vop3p.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" #include "arch/arm/insts/fplib.hh" namespace gem5 @@ -631,5 +632,236 @@ void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) vdst.write(); } +// --- Inst_VOP3P__V_PK_FMA_F32 class methods --- + +Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_fma_f32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_FMA_F32 + +Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32() +{ +} // ~Inst_VOP3P__V_PK_FMA_F32 + +// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] = +// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] . +void +Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. 
+    Wavefront *wf = gpuDynInst->wavefront();
+    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+    ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
+    VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+    src0.readSrc();
+    src1.readSrc();
+    src2.readSrc();
+
+    int opsel = instData.OPSEL;
+    int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
+
+    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+        if (wf->execMask(lane)) {
+            uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                       : bits(src0[lane], 31, 0);
+            uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                       : bits(src1[lane], 31, 0);
+            uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
+                                       : bits(src2[lane], 31, 0);
+
+            float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
+                                    *reinterpret_cast<float*>(&s1l),
+                                    *reinterpret_cast<float*>(&s2l));
+
+            uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
+                                          : bits(src0[lane], 31, 0);
+            uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
+                                          : bits(src1[lane], 31, 0);
+            uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
+                                          : bits(src2[lane], 31, 0);
+
+            float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
+                                    *reinterpret_cast<float*>(&s1h),
+                                    *reinterpret_cast<float*>(&s2h));
+
+            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
+            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
+
+            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
+        }
+    }
+
+    vdst.write();
+} // execute
+// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---
+
+Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt)
+    : Inst_VOP3P(iFmt, "v_pk_mul_f32")
+{
+    setFlag(ALU);
+} // Inst_VOP3P__V_PK_MUL_F32
+
+Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32()
+{
+} // ~Inst_VOP3P__V_PK_MUL_F32
+
+// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
+// S1.f[31:0]
+void
+Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
+{
+    // This is a special case of packed instructions which operates on
+    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
+    // values cannot use bitwise operations. Consider the U64 to imply
+    // untyped 64-bits of data.
+    Wavefront *wf = gpuDynInst->wavefront();
+    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+    VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+    src0.readSrc();
+    src1.readSrc();
+
+    int opsel = instData.OPSEL;
+    int opsel_hi = extData.OPSEL_HI;
+
+    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+        if (wf->execMask(lane)) {
+            uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                               : bits(src0[lane], 31, 0);
+            uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                               : bits(src1[lane], 31, 0);
+
+            float dword1 = *reinterpret_cast<float*>(&lower_dword)
+                         * *reinterpret_cast<float*>(&upper_dword);
+
+            lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
+                                         : bits(src0[lane], 31, 0);
+            upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
+                                         : bits(src1[lane], 31, 0);
+
+            float dword2 = *reinterpret_cast<float*>(&lower_dword)
+                         * *reinterpret_cast<float*>(&upper_dword);
+
+            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
+            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
+
+            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
+        }
+    }
+
+    vdst.write();
+} // execute
+// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---
+
+Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt)
+    : Inst_VOP3P(iFmt, "v_pk_add_f32")
+{
+    setFlag(ALU);
+} // Inst_VOP3P__V_PK_ADD_F32
+
+Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32()
+{
+} // ~Inst_VOP3P__V_PK_ADD_F32
+
+// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
+// S1.f[31:0]
+void
+Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
+{
+    // This is a special case of packed instructions which operates on
+    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
+    // values cannot use bitwise operations. Consider the U64 to imply
+    // untyped 64-bits of data.
+    Wavefront *wf = gpuDynInst->wavefront();
+    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+    VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+    src0.readSrc();
+    src1.readSrc();
+
+    int opsel = instData.OPSEL;
+    int opsel_hi = extData.OPSEL_HI;
+
+    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+        if (wf->execMask(lane)) {
+            uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                               : bits(src0[lane], 31, 0);
+            uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                               : bits(src1[lane], 31, 0);
+
+            float dword1 = *reinterpret_cast<float*>(&lower_dword)
+                         + *reinterpret_cast<float*>(&upper_dword);
+
+            lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
+                                         : bits(src0[lane], 31, 0);
+            upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
+                                         : bits(src1[lane], 31, 0);
+
+            float dword2 = *reinterpret_cast<float*>(&lower_dword)
+                         + *reinterpret_cast<float*>(&upper_dword);
+
+            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
+            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
+
+            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
+        }
+    }
+
+    vdst.write();
+} // execute
+// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
+
+Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)
+    : Inst_VOP3P(iFmt, "v_pk_mov_b32")
+{
+    setFlag(ALU);
+} // Inst_VOP3P__V_PK_MOV_B32
+
+Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32()
+{
+} // ~Inst_VOP3P__V_PK_MOV_B32
+
+// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0].
+void
+Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
+{
+    // This is a special case of packed instructions which operates on
+    // 64-bit inputs/outputs and not 32-bit.
+ Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + // Only OPSEL[1:0] are used + // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0 + + int opsel = instData.OPSEL; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + // OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1 + uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + vdst[lane] = upper_dword << 32 | lower_dword; + } + } + + vdst.write(); +} // execute + } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p_mai.cc b/src/arch/amdgpu/vega/insts/vop3p_mai.cc new file mode 100644 index 0000000000..943aa72cfd --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3p_mai.cc @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "arch/amdgpu/vega/insts/vop3p.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 class methods --- + + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8(InFmt_VOP3P_MAI *iFmt) + : Inst_VOP3P_MAI(iFmt, "v_mfma_i32_16x16x16i8") + { + setFlag(ALU); + } // Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 + + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8:: + ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8() + { + } // ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 + + // D(16x16I32) = A(16x16I8) x B(16x16I8) + C(16x16I32), 1 Blocks, 8 + // pass, srcA/srcB 1 archVgpr, srcC/D 4 accVGPR + void + Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst) + { + int acc_offset = 0; + if (instData.ACC_CD) { + warn("ACC_CD not yet implemented\n"); + } + + // int8 size allows for 4 elements per lane. At 16x16 this means 4 + // lanes per column (A matrix) / (B matrix). This whole matrix fits + // in one VGPR. The C matrix with size int32 requires 4 VGPRs. + // Handle the C matrix by using a delta. 
This is set to 1 normally to + // move to the next VGPR (1 dword away) and 0 if the input is a scalar + // reg (e.g., a constant). + int delta = isVectorReg(extData.SRC2) ? 1 : 0; + + // VecOperandI8 will read 8 bits and sign extend, so used U32 to read + // as "untyped" 32-bit values. + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset); + ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); + ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); + ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + + VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset); + VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1); + VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2); + VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3); + + src0.readSrc(); + src1.readSrc(); + src2a.readSrc(); + src2b.readSrc(); + src2c.readSrc(); + src2d.readSrc(); + + int32_t A[16][16]; + for (int i = 0; i < 64; ++i) { + // src0[0:15] contains columns 1 - 4 packed for rows 0 - 15, + // src0[16:31] contains columns 5 - 8 packed for rows 0 - 15, + // src0[32:47] contains columns 9 - 12 packed for rows 0 - 15, + // src0[48:63] contains columns 13 - 16 packed for rows 0 - 15, + int row = i % 16; + int start_col = (i / 16) * 4; + + A[row][start_col+0] = sext<8>(bits(src0[i], 7, 0)); + A[row][start_col+1] = sext<8>(bits(src0[i], 15, 8)); + A[row][start_col+2] = sext<8>(bits(src0[i], 23, 16)); + A[row][start_col+3] = sext<8>(bits(src0[i], 31, 24)); + } + + int32_t B[16][16]; + for (int i = 0; i < 64; ++i) { + // src1[0:15] contains rows 1 - 4 packed for columns 0 - 15 + // src1[16:31] contains rows 5 - 8 packed for columns 0 - 15 + // src1[32:47] contains rows 9 - 12 packed for columns 0 - 15 + // src1[48:63] contains rows 13 - 16 packed for columns 0 - 15 + int start_row = (i / 16) * 4; + int col = i % 16; + 
+ B[start_row+0][col] = sext<8>(bits(src1[i], 7, 0)); + B[start_row+1][col] = sext<8>(bits(src1[i], 15, 8)); + B[start_row+2][col] = sext<8>(bits(src1[i], 23, 16)); + B[start_row+3][col] = sext<8>(bits(src1[i], 31, 24)); + } + + int32_t result[16][16]; + + // Load accumulation matrix C into result + for (int i = 0; i < 64; ++i) { + // src2a contains rows 0, 4, 8, 12 + result[(i/16)*4][(i%16)] = src2a[i]; + // src2b contains rows 1, 5, 9, 13 + result[(i/16)*4+1][(i%16)] = src2b[i]; + // src2c contains rows 2, 6, 10, 14 + result[(i/16)*4+2][(i%16)] = src2c[i]; + // src2d contains rows 3, 7, 11, 15 + result[(i/16)*4+3][(i%16)] = src2d[i]; + } + + // Compute new result - This is (obviously) not optimized + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + for (int k = 0; k < 16; ++k) { + result[i][j] += A[i][k] * B[k][j]; + } + } + } + + // Put result in dest VGPRs + for (int i = 0; i < 64; ++i) { + // vdsta contains rows 0, 4, 8, 12 + vdsta[i] = result[(i/16)*4][(i%16)]; + // vdstb contains rows 1, 5, 9, 13 + vdstb[i] = result[(i/16)*4+1][(i%16)]; + // vdstc contains rows 2, 6, 10, 14 + vdstc[i] = result[(i/16)*4+2][(i%16)]; + // vdstd contains rows 3, 7, 11, 15 + vdstd[i] = result[(i/16)*4+3][(i%16)]; + } + + vdsta.write(); + vdstb.write(); + vdstc.write(); + vdstd.write(); + } // execute + // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods --- + + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI *iFmt) + : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64") + { + setFlag(ALU); + } // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 + + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64:: + ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64() + { + } // ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 + + // D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8 + // pass, srcA/srcB 2 VGPR, srcC/D 8 VGPR + void + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst) + { + int acc_offset = 0; + if (instData.ACC_CD) { + 
warn("ACC_CD not yet implemented\n"); + } + + // Handling of src2 is a bit tricky. The operator[] overload cannot + // be used for dword count > 2, and the dword count here is 8. Usually + // src2 is a VGPR/AccGPR, but it might also be constant. In order to + // use operator[] and handle constants, check for VGPR here and set + // a delta for each of the pairs of src2 GPRs. + int delta = isVectorReg(extData.SRC2) ? 2 : 0; + + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset); + ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); + ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); + ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + + VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset); + VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2); + VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4); + VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6); + + src0.readSrc(); + src1.readSrc(); + src2a.readSrc(); + src2b.readSrc(); + src2c.readSrc(); + src2d.readSrc(); + + double result[16][16]; + + // Load src2 into result. 
src2 is row major + for (int i = 0; i < 64; ++i) { + // src2a contains rows 0 - 3 + result[(i/16)][(i%16)] = src2a[i]; + // src2b contains rows 4 - 7 + result[(i/16)+4][(i%16)] = src2b[i]; + // src2c contains rows 8 - 11 + result[(i/16)+8][(i%16)] = src2c[i]; + // src2d contains rows 12 - 15 + result[(i/16)+12][(i%16)] = src2d[i]; + } + + // Compute new result + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + for (int k = 0; k < 4; ++k) { + // src0 is column major, src1 is row major + int lane_A = 16*k + i; + int lane_B = 16*k + j; + result[i][j] += src0[lane_A] * src1[lane_B]; + } + } + } + + // Put result in dest VGPRs + for (int i = 0; i < 64; ++i) { + // vdsta contains rows 0 - 3 + vdsta[i] = result[(i/16)][(i%16)]; + // src2b contains rows 4 - 7 + vdstb[i] = result[(i/16)+4][(i%16)]; + // src2c contains rows 8 - 11 + vdstc[i] = result[(i/16)+8][(i%16)]; + // src2d contains rows 12 - 15 + vdstd[i] = result[(i/16)+12][(i%16)]; + } + + vdsta.write(); + vdstb.write(); + vdstc.write(); + vdstd.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vopc.cc b/src/arch/amdgpu/vega/insts/vopc.cc new file mode 100644 index 0000000000..2c386fec74 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vopc.cc @@ -0,0 +1,6590 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOPC__V_CMP_CLASS_F32 class methods --- + + Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_CLASS_F32 + + Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32() + { + } // ~Inst_VOPC__V_CMP_CLASS_F32 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. 
+ // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + 
continue; + } + } + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F32 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F32 + + Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F32 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.f The function reports true if the floating point value is *any* of + // the numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMP_CLASS_F64 class methods --- + + 
Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_CLASS_F64 + + Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() + { + } // ~Inst_VOPC__V_CMP_CLASS_F64 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.d + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F64 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC 
*iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F64 + + Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F64 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.d The function reports true if the floating point value is *any* of + // the numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if 
(bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMP_CLASS_F16 class methods --- + + Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_CLASS_F16 + + Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16() + { + } // ~Inst_VOPC__V_CMP_CLASS_F16 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F16 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F16 + + Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F16 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // --- S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_F_F16 class methods --- + + Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_F_F16 + + Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() + { + } // ~Inst_VOPC__V_CMP_F_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LT_F16 class methods --- + + Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LT_F16 + + Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() + { + } // ~Inst_VOPC__V_CMP_LT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_F16 class methods --- + + Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_EQ_F16 + + Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() + { + } // ~Inst_VOPC__V_CMP_EQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LE_F16 class methods --- + + Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LE_F16 + + Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() + { + } // ~Inst_VOPC__V_CMP_LE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_GT_F16 class methods --- + + Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_GT_F16 + + Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() + { + } // ~Inst_VOPC__V_CMP_GT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LG_F16 class methods --- + + Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lg_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LG_F16 + + Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() + { + } // ~Inst_VOPC__V_CMP_LG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_GE_F16 class methods --- + + Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_GE_F16 + + Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() + { + } // ~Inst_VOPC__V_CMP_GE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_O_F16 class methods --- + + Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_o_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_O_F16 + + Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() + { + } // ~Inst_VOPC__V_CMP_O_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_U_F16 class methods --- + + Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_u_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_U_F16 + + Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() + { + } // ~Inst_VOPC__V_CMP_U_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NGE_F16 class methods --- + + Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nge_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NGE_F16 + + Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() + { + } // ~Inst_VOPC__V_CMP_NGE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLG_F16 class methods --- + + Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlg_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLG_F16 + + Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() + { + } // ~Inst_VOPC__V_CMP_NLG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NGT_F16 class methods --- + + Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ngt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NGT_F16 + + Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() + { + } // ~Inst_VOPC__V_CMP_NGT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLE_F16 class methods --- + + Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nle_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLE_F16 + + Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() + { + } // ~Inst_VOPC__V_CMP_NLE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NEQ_F16 class methods --- + + Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_neq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NEQ_F16 + + Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() + { + } // ~Inst_VOPC__V_CMP_NEQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLT_F16 class methods --- + + Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLT_F16 + + Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() + { + } // ~Inst_VOPC__V_CMP_NLT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_TRU_F16 class methods --- + + Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_tru_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_TRU_F16 + + Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() + { + } // ~Inst_VOPC__V_CMP_TRU_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_F_F16 class methods --- + + Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_F16 + + Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() + { + } // ~Inst_VOPC__V_CMPX_F_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_F16 class methods --- + + Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_F16 + + Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() + { + } // ~Inst_VOPC__V_CMPX_LT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_F16 class methods --- + + Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_F16 + + Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() + { + } // ~Inst_VOPC__V_CMPX_EQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_F16 class methods --- + + Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_F16 + + Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() + { + } // ~Inst_VOPC__V_CMPX_LE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_F16 class methods --- + + Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_F16 + + Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() + { + } // ~Inst_VOPC__V_CMPX_GT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LG_F16 class methods --- + + Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lg_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LG_F16 + + Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() + { + } // ~Inst_VOPC__V_CMPX_LG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_F16 class methods --- + + Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_F16 + + Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() + { + } // ~Inst_VOPC__V_CMPX_GE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_O_F16 class methods --- + + Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_o_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_O_F16 + + Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() + { + } // ~Inst_VOPC__V_CMPX_O_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_U_F16 class methods --- + + Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_u_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_U_F16 + + Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() + { + } // ~Inst_VOPC__V_CMPX_U_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. 
    // NOTE(review): every v_cmpx_*_f16 compare below is a decode-only stub;
    // execute() calls panicUnimplemented(), so issuing one of these fp16
    // compares is a hard error rather than a silent no-op.
    void
    Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NGE_F16 class methods ---

    Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nge_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        // CMPX variants write the EXEC mask in addition to VCC.
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NGE_F16

    Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NGE_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLG_F16 class methods ---

    Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlg_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NLG_F16

    Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLG_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NGT_F16 class methods ---

    Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ngt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NGT_F16

    Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NGT_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLE_F16 class methods ---

    Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nle_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NLE_F16

    Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLE_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NEQ_F16 class methods ---

    Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_neq_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NEQ_F16

    Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NEQ_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLT_F16 class methods ---

    Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NLT_F16

    Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLT_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_TRU_F16 class methods ---

    Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_tru_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_TRU_F16

    Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16()
    {
    } // ~Inst_VOPC__V_CMPX_TRU_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMP_F_F32 class methods ---

    Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_F_F32

    Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32()
    {
    } // ~Inst_VOPC__V_CMP_F_F32

    // --- description from .arch file ---
    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "Always false" compare: clear the VCC bit of every active lane.
        // Lanes that are inactive in EXEC keep their previous VCC bit.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_LT_F32 class methods ---

    Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_LT_F32

    Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32()
    {
    } // ~Inst_VOPC__V_CMP_LT_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Common VOPC pattern for all implemented compares below: the per-lane
    // compare result is written into the lane's bit of VCC for active lanes
    // only; inactive lanes retain their prior VCC bit. src0 is fetched with
    // readSrc() while src1 uses read() — NOTE(review): presumably SRC0 may
    // name a non-VGPR (scalar/constant) source while VSRC1 is always a VGPR;
    // confirm against the operand classes.
    void
    Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_EQ_F32 class methods ---

    Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_EQ_F32

    Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32()
    {
    } // ~Inst_VOPC__V_CMP_EQ_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_LE_F32 class methods ---

    Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_LE_F32

    Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32()
    {
    } // ~Inst_VOPC__V_CMP_LE_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_GT_F32 class methods ---

    Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_GT_F32

    Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32()
    {
    } // ~Inst_VOPC__V_CMP_GT_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_LG_F32 class methods ---

    Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_LG_F32

    Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32()
    {
    } // ~Inst_VOPC__V_CMP_LG_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // "<>" (less-than or greater-than): both orderings are false
                // for a NaN operand, so LG is false on unordered inputs.
                vcc.setBit(lane, (src0[lane] < src1[lane]
                    || src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_GE_F32 class methods ---

    Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_GE_F32

    Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32()
    {
    } // ~Inst_VOPC__V_CMP_GE_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_O_F32 class methods ---

    Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_o_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_O_F32

    Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32()
    {
    } // ~Inst_VOPC__V_CMP_O_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // "Ordered": true iff neither operand is NaN.
                vcc.setBit(lane, (!std::isnan(src0[lane])
                    && !std::isnan(src1[lane])) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_U_F32 class methods ---

    Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_u_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_U_F32

    Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32()
    {
    } // ~Inst_VOPC__V_CMP_U_F32

    // --- description from .arch file ---
    // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // "Unordered": true iff at least one operand is NaN.
                vcc.setBit(lane, (std::isnan(src0[lane])
                    || std::isnan(src1[lane])) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_NGE_F32 class methods ---

    Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NGE_F32

    Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32()
    {
    } // ~Inst_VOPC__V_CMP_NGE_F32

    // --- description from .arch file ---
    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    // The N* compares negate the ordered compare, so they are true on
    // unordered (NaN) inputs.
    void
    Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_NLG_F32 class methods ---

    Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NLG_F32

    Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32()
    {
    } // ~Inst_VOPC__V_CMP_NLG_F32

    // --- description from .arch file ---
    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] < src1[lane]
                    || src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_NGT_F32 class methods ---

    Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ngt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NGT_F32

    Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32()
    {
    } // ~Inst_VOPC__V_CMP_NGT_F32

    // --- description from .arch file ---
    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_NLE_F32 class methods ---

    Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nle_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NLE_F32

    Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32()
    {
    } // ~Inst_VOPC__V_CMP_NLE_F32

    // --- description from .arch file ---
    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_NEQ_F32 class methods ---

    Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_neq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NEQ_F32

    Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32()
    {
    } // ~Inst_VOPC__V_CMP_NEQ_F32

    // --- description from .arch file ---
    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // operator!= is true for unordered (NaN) operands, which
                // matches !(S0 == S1) since == is false on NaN.
                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_NLT_F32 class methods ---

    Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NLT_F32

    Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32()
    {
    } // ~Inst_VOPC__V_CMP_NLT_F32

    // --- description from .arch file ---
    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMP_TRU_F32 class methods ---

    Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_tru_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_TRU_F32

    Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32()
    {
    } // ~Inst_VOPC__V_CMP_TRU_F32

    // --- description from .arch file ---
    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "Always true" compare: set the VCC bit of every active lane.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        vcc.write();
    } // execute
    // --- Inst_VOPC__V_CMPX_F_F32 class methods ---

    Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_F_F32

    Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32()
    {
    } // ~Inst_VOPC__V_CMPX_F_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        vcc.write();
        // CMPX: the whole updated VCC value also becomes the new EXEC mask.
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_LT_F32 class methods ---

    Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_LT_F32

    Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_LT_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // CMPX pattern: identical to the plain compare, except that after VCC is
    // written the full 64-bit VCC value (vcc.rawData()) is copied into the
    // wavefront's EXEC mask — including bits belonging to lanes that were
    // inactive for this instruction, which keep their stale VCC value.
    void
    Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_EQ_F32 class methods ---

    Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_EQ_F32

    Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32()
    {
    } // ~Inst_VOPC__V_CMPX_EQ_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_LE_F32 class methods ---

    Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_LE_F32

    Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_LE_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_GT_F32 class methods ---

    Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_GT_F32

    Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_GT_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_LG_F32 class methods ---

    Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_LG_F32

    Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32()
    {
    } // ~Inst_VOPC__V_CMPX_LG_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // "<>": false when either operand is NaN (unordered).
                vcc.setBit(lane, (src0[lane] < src1[lane]
                    || src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_GE_F32 class methods ---

    Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_GE_F32

    Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_GE_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_O_F32 class methods ---

    Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_o_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_O_F32

    Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32()
    {
    } // ~Inst_VOPC__V_CMPX_O_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, (!std::isnan(src0[lane])
                    && !std::isnan(src1[lane])) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_U_F32 class methods ---

    Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_u_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_U_F32

    Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32()
    {
    } // ~Inst_VOPC__V_CMPX_U_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, (std::isnan(src0[lane])
                    || std::isnan(src1[lane])) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_NGE_F32 class methods ---

    Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NGE_F32

    Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NGE_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    // Like the plain N* compares, negation makes this true on NaN inputs.
    void
    Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLG_F32 class methods ---

    Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NLG_F32

    Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NLG_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] < src1[lane]
                    || src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_NGT_F32 class methods ---

    Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ngt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NGT_F32

    Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NGT_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLE_F32 class methods ---

    Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nle_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NLE_F32

    Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NLE_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_NEQ_F32 class methods ---

    Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_neq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NEQ_F32

    Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NEQ_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // !(==) is true for unordered (NaN) operands, matching the
                // arch description (== is false on NaN).
                vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLT_F32 class methods ---

    Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_NLT_F32

    Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NLT_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        wf->execMask() = vcc.rawData();
    } // execute
    // --- Inst_VOPC__V_CMPX_TRU_F32 class methods ---

    Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_tru_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(WritesEXEC);
    } // Inst_VOPC__V_CMPX_TRU_F32

    Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32()
    {
    } // ~Inst_VOPC__V_CMPX_TRU_F32

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        // Unconditionally true: set the VCC bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, 1);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMP_F_F64 class methods ---
+    // NOTE: the non-X V_CMP_* variants below only write VCC; they do not
+    // touch the EXEC mask.
+
+    Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_f_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_F_F64
+
+    Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_F_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        // Unconditionally false: clear the VCC bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_LT_F64 class methods ---
+
+    Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_lt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_LT_F64
+
+    Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_LT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_EQ_F64 class methods ---
+
+    Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_eq_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_EQ_F64
+
+    Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_EQ_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_LE_F64 class methods ---
+
+    Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_le_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_LE_F64
+
+    Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_LE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_GT_F64 class methods ---
+
+    Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_gt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_GT_F64
+
+    Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_GT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_LG_F64 class methods ---
+
+    Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_lg_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_LG_F64
+
+    Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_LG_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // "less-than or greater-than" is false (unordered) if either
+        // operand is NaN, unlike operator!=.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, (src0[lane] < src1[lane]
+                    || src0[lane] > src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_GE_F64 class methods ---
+
+    Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_ge_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_GE_F64
+
+    Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_GE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_O_F64 class methods ---
+
+    Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_o_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_O_F64
+
+    Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_O_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Ordered: true only when neither operand is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, (!std::isnan(src0[lane])
+                    && !std::isnan(src1[lane])) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_U_F64 class methods ---
+
+    Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_u_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_U_F64
+
+    Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_U_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Unordered: true when either operand is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, (std::isnan(src0[lane])
+                    || std::isnan(src1[lane])) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_NGE_F64 class methods ---
+
+    Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_nge_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_NGE_F64
+
+    Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_NGE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // N-variants negate the comparison, so they are true for NaN
+        // operands (unordered).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_NLG_F64 class methods ---
+
+    Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_nlg_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_NLG_F64
+
+    Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_NLG_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] < src1[lane]
+                    || src0[lane] > src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_NGT_F64 class methods ---
+
+    Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_ngt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_NGT_F64
+
+    Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_NGT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_NLE_F64 class methods ---
+
+    Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_nle_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_NLE_F64
+
+    Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_NLE_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_NEQ_F64 class methods ---
+
+    Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_neq_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_NEQ_F64
+
+    Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_NEQ_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_NLT_F64 class methods ---
+
+    Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_nlt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_NLT_F64
+
+    Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_NLT_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_TRU_F64 class methods ---
+
+    Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_tru_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOPC__V_CMP_TRU_F64
+
+    Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64()
+    {
+    } // ~Inst_VOPC__V_CMP_TRU_F64
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        // Unconditionally true: set the VCC bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, 1);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_F_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_f_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_F_F64
+
+    Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_F_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        // Unconditionally false: clear the VCC bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_LT_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_lt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_LT_F64
+
+    Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_LT_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = (S0 < S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_EQ_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_eq_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_EQ_F64
+
+    Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_EQ_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = (S0 == S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_LE_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_le_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_LE_F64
+
+    Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_LE_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    // Consistency fix: the other CMPX implementations in this file commit
+    // VCC with vcc.write() before mirroring the staged result into EXEC.
+    // These two statements are independent (rawData() returns the locally
+    // staged value, not the register file contents), so normalizing the
+    // order is behavior-preserving.
+    void
+    Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = (S0 <= S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_GT_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_gt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_GT_F64
+
+    Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_GT_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = (S0 > S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_LG_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_lg_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_LG_F64
+
+    Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_LG_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    // Consistency fix: commit VCC before mirroring the staged result into
+    // EXEC, matching the ordering used by the other CMPX implementations
+    // in this file.  The statements are independent (rawData() reads the
+    // locally staged value), so this is behavior-preserving.
+    void
+    Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = (S0 < S1 || S0 > S1); false if either
+        // operand is NaN (unordered), unlike operator!=.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, (src0[lane] < src1[lane]
+                    || src0[lane] > src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_GE_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_ge_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_GE_F64
+
+    Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_GE_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = (S0 >= S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_O_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_o_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_O_F64
+
+    Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_O_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
+    // encoding.
+    void
+    Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Ordered: true only when neither operand is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, (!std::isnan(src0[lane])
+                    && !std::isnan(src1[lane])) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_U_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_u_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_U_F64
+
+    Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_U_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC
+    // encoding.
+    // Consistency fix: commit VCC before mirroring the staged result into
+    // EXEC, matching the ordering used by the other CMPX implementations
+    // in this file.  The statements are independent (rawData() reads the
+    // locally staged value), so this is behavior-preserving.
+    void
+    Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Unordered: true when either operand is NaN.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, (std::isnan(src0[lane])
+                    || std::isnan(src1[lane])) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_NGE_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_nge_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_NGE_F64
+
+    Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_NGE_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = !(S0 >= S1); true for NaN operands.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_NLG_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_nlg_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_NLG_F64
+
+    Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_NLG_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = !(S0 < S1 || S0 > S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] < src1[lane]
+                    || src0[lane] > src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_NGT_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_ngt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_NGT_F64
+
+    Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_NGT_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
+    // Consistency fix: commit VCC before mirroring the staged result into
+    // EXEC, matching the ordering used by the other CMPX implementations
+    // in this file.  The statements are independent (rawData() reads the
+    // locally staged value), so this is behavior-preserving.
+    void
+    Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = !(S0 > S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_NLE_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_nle_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_NLE_F64
+
+    Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_NLE_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = !(S0 <= S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_NEQ_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_neq_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_NEQ_F64
+
+    Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_NEQ_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = (S0 != S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_NLT_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_nlt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_NLT_F64
+
+    Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_NLT_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        // Per active lane: VCC bit = !(S0 < S1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMPX_TRU_F64 class methods ---
+
+    Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmpx_tru_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+        setFlag(WritesEXEC);
+    } // Inst_VOPC__V_CMPX_TRU_F64
+
+    Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64()
+    {
+    } // ~Inst_VOPC__V_CMPX_TRU_F64
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    // Consistency fix (CMPX_TRU_F64 only): commit VCC before mirroring the
+    // staged result into EXEC, matching the ordering used by the other
+    // CMPX implementations in this file.  The statements are independent
+    // (rawData() reads the locally staged value), so this is
+    // behavior-preserving.  The V_CMP_*_I16 implementations below are
+    // unchanged; they only write VCC.
+    void
+    Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        // Unconditionally true: set the VCC bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, 1);
+            }
+        }
+
+        vcc.write();
+        // CMPX also writes the compare result to the EXEC mask.
+        wf->execMask() = vcc.rawData();
+    } // execute
+    // --- Inst_VOPC__V_CMP_F_I16 class methods ---
+
+    Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_f_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_F_I16
+
+    Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_F_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        // Unconditionally false: clear the VCC bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_LT_I16 class methods ---
+
+    Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_lt_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_LT_I16
+
+    Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_LT_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_EQ_I16 class methods ---
+
+    Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_eq_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_EQ_I16
+
+    Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_EQ_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_LE_I16 class methods ---
+
+    Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_le_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_LE_I16
+
+    Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_LE_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_GT_I16 class methods ---
+
+    Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_gt_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_GT_I16
+
+    Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_GT_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_NE_I16 class methods ---
+
+    Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_ne_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_NE_I16
+
+    Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_NE_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_GE_I16 class methods ---
+
+    Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_ge_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_GE_I16
+
+    Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_GE_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
+        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        vcc.write();
+    } // execute
+    // --- Inst_VOPC__V_CMP_T_I16 class methods ---
+
+    Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt)
+        : Inst_VOPC(iFmt, "v_cmp_t_i16")
+    {
+        setFlag(ALU);
+    } // Inst_VOPC__V_CMP_T_I16
+
+    Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16()
+    {
+    } // ~Inst_VOPC__V_CMP_T_I16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+ void + Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U16 class methods --- + + Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U16 + + Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() + { + } // ~Inst_VOPC__V_CMP_F_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U16 class methods --- + + Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U16 + + Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() + { + } // ~Inst_VOPC__V_CMP_LT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U16 class methods --- + + Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U16 + + Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() + { + } // ~Inst_VOPC__V_CMP_EQ_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U16 class methods --- + + Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U16 + + Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() + { + } // ~Inst_VOPC__V_CMP_LE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U16 class methods --- + + Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U16 + + Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() + { + } // ~Inst_VOPC__V_CMP_GT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U16 class methods --- + + Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U16 + + Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() + { + } // ~Inst_VOPC__V_CMP_NE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U16 class methods --- + + Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U16 + + Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() + { + } // ~Inst_VOPC__V_CMP_GE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U16 class methods --- + + Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U16 + + Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() + { + } // ~Inst_VOPC__V_CMP_T_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I16 class methods --- + + Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I16 + + Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() + { + } // ~Inst_VOPC__V_CMPX_F_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I16 class methods --- + + Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I16 + + Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() + { + } // ~Inst_VOPC__V_CMPX_LT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I16 class methods --- + + Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I16 + + Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() + { + } // ~Inst_VOPC__V_CMPX_EQ_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I16 class methods --- + + Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I16 + + Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() + { + } // ~Inst_VOPC__V_CMPX_LE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I16 class methods --- + + Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I16 + + Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() + { + } // ~Inst_VOPC__V_CMPX_GT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I16 class methods --- + + Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I16 + + Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() + { + } // ~Inst_VOPC__V_CMPX_NE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I16 class methods --- + + Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I16 + + Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() + { + } // ~Inst_VOPC__V_CMPX_GE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I16 class methods --- + + Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I16 + + Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() + { + } // ~Inst_VOPC__V_CMPX_T_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U16 class methods --- + + Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U16 + + Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() + { + } // ~Inst_VOPC__V_CMPX_F_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U16 class methods --- + + Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U16 + + Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() + { + } // ~Inst_VOPC__V_CMPX_LT_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U16 class methods --- + + Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U16 + + Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() + { + } // ~Inst_VOPC__V_CMPX_EQ_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U16 class methods --- + + Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U16 + + Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() + { + } // ~Inst_VOPC__V_CMPX_LE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U16 class methods --- + + Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U16 + + Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() + { + } // ~Inst_VOPC__V_CMPX_GT_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U16 class methods --- + + Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U16 + + Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() + { + } // ~Inst_VOPC__V_CMPX_NE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U16 class methods --- + + Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U16 + + Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() + { + } // ~Inst_VOPC__V_CMPX_GE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U16 class methods --- + + Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U16 + + Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() + { + } // ~Inst_VOPC__V_CMPX_T_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I32 class methods --- + + Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I32 + + Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() + { + } // ~Inst_VOPC__V_CMP_F_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I32 class methods --- + + Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I32 + + Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() + { + } // ~Inst_VOPC__V_CMP_LT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I32 class methods --- + + Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I32 + + Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() + { + } // ~Inst_VOPC__V_CMP_EQ_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I32 class methods --- + + Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I32 + + Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() + { + } // ~Inst_VOPC__V_CMP_LE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I32 class methods --- + + Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I32 + + Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() + { + } // ~Inst_VOPC__V_CMP_GT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I32 class methods --- + + Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I32 + + Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() + { + } // ~Inst_VOPC__V_CMP_NE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I32 class methods --- + + Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I32 + + Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() + { + } // ~Inst_VOPC__V_CMP_GE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I32 class methods --- + + Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I32 + + Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() + { + } // ~Inst_VOPC__V_CMP_T_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U32 class methods --- + + Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U32 + + Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() + { + } // ~Inst_VOPC__V_CMP_F_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U32 class methods --- + + Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U32 + + Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() + { + } // ~Inst_VOPC__V_CMP_LT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U32 class methods --- + + Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U32 + + Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() + { + } // ~Inst_VOPC__V_CMP_EQ_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U32 class methods --- + + Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U32 + + Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() + { + } // ~Inst_VOPC__V_CMP_LE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U32 class methods --- + + Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U32 + + Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() + { + } // ~Inst_VOPC__V_CMP_GT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U32 class methods --- + + Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U32 + + Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() + { + } // ~Inst_VOPC__V_CMP_NE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U32 class methods --- + + Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U32 + + Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() + { + } // ~Inst_VOPC__V_CMP_GE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U32 class methods --- + + Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U32 + + Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() + { + } // ~Inst_VOPC__V_CMP_T_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I32 class methods --- + + Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I32 + + Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() + { + } // ~Inst_VOPC__V_CMPX_F_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I32 class methods --- + + Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I32 + + Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() + { + } // ~Inst_VOPC__V_CMPX_LT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I32 class methods --- + + Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I32 + + Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() + { + } // ~Inst_VOPC__V_CMPX_EQ_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I32 class methods --- + + Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I32 + + Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() + { + } // ~Inst_VOPC__V_CMPX_LE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I32 class methods --- + + Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I32 + + Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() + { + } // ~Inst_VOPC__V_CMPX_GT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I32 class methods --- + + Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I32 + + Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() + { + } // ~Inst_VOPC__V_CMPX_NE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I32 class methods --- + + Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I32 + + Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() + { + } // ~Inst_VOPC__V_CMPX_GE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I32 class methods --- + + Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I32 + + Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() + { + } // ~Inst_VOPC__V_CMPX_T_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U32 class methods --- + + Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U32 + + Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() + { + } // ~Inst_VOPC__V_CMPX_F_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U32 class methods --- + + Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U32 + + Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() + { + } // ~Inst_VOPC__V_CMPX_LT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U32 class methods --- + + Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U32 + + Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() + { + } // ~Inst_VOPC__V_CMPX_EQ_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U32 class methods --- + + Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U32 + + Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() + { + } // ~Inst_VOPC__V_CMPX_LE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U32 class methods --- + + Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U32 + + Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() + { + } // ~Inst_VOPC__V_CMPX_GT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U32 class methods --- + + Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U32 + + Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() + { + } // ~Inst_VOPC__V_CMPX_NE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U32 class methods --- + + Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U32 + + Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() + { + } // ~Inst_VOPC__V_CMPX_GE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U32 class methods --- + + Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U32 + + Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() + { + } // ~Inst_VOPC__V_CMPX_T_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I64 class methods --- + + Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I64 + + Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() + { + } // ~Inst_VOPC__V_CMP_F_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I64 class methods --- + + Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I64 + + Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() + { + } // ~Inst_VOPC__V_CMP_LT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I64 class methods --- + + Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I64 + + Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() + { + } // ~Inst_VOPC__V_CMP_EQ_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I64 class methods --- + + Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I64 + + Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() + { + } // ~Inst_VOPC__V_CMP_LE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I64 class methods --- + + Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I64 + + Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() + { + } // ~Inst_VOPC__V_CMP_GT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I64 class methods --- + + Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I64 + + Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() + { + } // ~Inst_VOPC__V_CMP_NE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I64 class methods --- + + Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I64 + + Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() + { + } // ~Inst_VOPC__V_CMP_GE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I64 class methods --- + + Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I64 + + Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() + { + } // ~Inst_VOPC__V_CMP_T_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U64 class methods --- + + Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U64 + + Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() + { + } // ~Inst_VOPC__V_CMP_F_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U64 class methods --- + + Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U64 + + Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() + { + } // ~Inst_VOPC__V_CMP_LT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U64 class methods --- + + Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U64 + + Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() + { + } // ~Inst_VOPC__V_CMP_EQ_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U64 class methods --- + + Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U64 + + Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() + { + } // ~Inst_VOPC__V_CMP_LE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U64 class methods --- + + Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U64 + + Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() + { + } // ~Inst_VOPC__V_CMP_GT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U64 class methods --- + + Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U64 + + Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() + { + } // ~Inst_VOPC__V_CMP_NE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U64 class methods --- + + Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U64 + + Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() + { + } // ~Inst_VOPC__V_CMP_GE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U64 class methods --- + + Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U64 + + Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() + { + } // ~Inst_VOPC__V_CMP_T_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I64 class methods --- + + Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I64 + + Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() + { + } // ~Inst_VOPC__V_CMPX_F_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I64 class methods --- + + Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I64 + + Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() + { + } // ~Inst_VOPC__V_CMPX_LT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I64 class methods --- + + Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I64 + + Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() + { + } // ~Inst_VOPC__V_CMPX_EQ_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I64 class methods --- + + Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I64 + + Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() + { + } // ~Inst_VOPC__V_CMPX_LE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I64 class methods --- + + Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I64 + + Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() + { + } // ~Inst_VOPC__V_CMPX_GT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I64 class methods --- + + Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I64 + + Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() + { + } // ~Inst_VOPC__V_CMPX_NE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I64 class methods --- + + Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I64 + + Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() + { + } // ~Inst_VOPC__V_CMPX_GE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I64 class methods --- + + Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I64 + + Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() + { + } // ~Inst_VOPC__V_CMPX_T_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U64 class methods --- + + Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U64 + + Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() + { + } // ~Inst_VOPC__V_CMPX_F_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U64 class methods --- + + Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U64 + + Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() + { + } // ~Inst_VOPC__V_CMPX_LT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U64 class methods --- + + Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U64 + + Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() + { + } // ~Inst_VOPC__V_CMPX_EQ_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U64 class methods --- + + Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U64 + + Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() + { + } // ~Inst_VOPC__V_CMPX_LE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U64 class methods --- + + Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U64 + + Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() + { + } // ~Inst_VOPC__V_CMPX_GT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U64 class methods --- + + Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U64 + + Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() + { + } // ~Inst_VOPC__V_CMPX_NE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U64 class methods --- + + Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U64 + + Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() + { + } // ~Inst_VOPC__V_CMPX_GE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U64 class methods --- + + Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U64 + + Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() + { + } // ~Inst_VOPC__V_CMPX_T_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 From a5757e7e012b80b97db330f7560c06ea967d7c99 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 19 Jan 2024 13:30:42 -0600 Subject: [PATCH 2/2] arch-vega: Rename mismatched source/header files The files registers.cc, isa.cc, and decoder.cc do not match the header name. This is a minor cleanup to make development more straightforward. Change-Id: Ibab18dfe315b0ce84359939b490f8227ea43cac0 --- src/arch/amdgpu/vega/SConscript | 6 +++--- src/arch/amdgpu/vega/{decoder.cc => gpu_decoder.cc} | 3 ++- src/arch/amdgpu/vega/{isa.cc => gpu_isa.cc} | 0 src/arch/amdgpu/vega/{registers.cc => gpu_registers.cc} | 0 4 files changed, 5 insertions(+), 4 deletions(-) rename src/arch/amdgpu/vega/{decoder.cc => gpu_decoder.cc} (99%) rename src/arch/amdgpu/vega/{isa.cc => gpu_isa.cc} (100%) rename src/arch/amdgpu/vega/{registers.cc => gpu_registers.cc} (100%) diff --git a/src/arch/amdgpu/vega/SConscript b/src/arch/amdgpu/vega/SConscript index 912c02cfdc..b7a28a8d6c 100644 --- a/src/arch/amdgpu/vega/SConscript +++ b/src/arch/amdgpu/vega/SConscript @@ -49,12 +49,12 @@ Source('tlb_coalescer.cc') DebugFlag('GPUPTWalker', 'Debug flag for GPU page table walker') if env['CONF']['TARGET_GPU_ISA'] == 'vega': - Source('decoder.cc') + Source('gpu_decoder.cc') Source('insts/gpu_static_inst.cc') Source('insts/instructions.cc') Source('insts/op_encodings.cc') - Source('isa.cc') - Source('registers.cc') + Source('gpu_isa.cc') + Source('gpu_registers.cc') Source('insts/sop2.cc') Source('insts/sopk.cc') diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc similarity index 99% rename 
from src/arch/amdgpu/vega/decoder.cc rename to src/arch/amdgpu/vega/gpu_decoder.cc index 5e2402a4af..940840719b 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -29,9 +29,10 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "arch/amdgpu/vega/gpu_decoder.hh" + #include -#include "arch/amdgpu/vega/gpu_decoder.hh" #include "arch/amdgpu/vega/insts/gpu_static_inst.hh" #include "arch/amdgpu/vega/insts/instructions.hh" #include "arch/amdgpu/vega/insts/vop3p.hh" diff --git a/src/arch/amdgpu/vega/isa.cc b/src/arch/amdgpu/vega/gpu_isa.cc similarity index 100% rename from src/arch/amdgpu/vega/isa.cc rename to src/arch/amdgpu/vega/gpu_isa.cc diff --git a/src/arch/amdgpu/vega/registers.cc b/src/arch/amdgpu/vega/gpu_registers.cc similarity index 100% rename from src/arch/amdgpu/vega/registers.cc rename to src/arch/amdgpu/vega/gpu_registers.cc