arch-vega: Add Vega ISA as a copy of GCN3

This changeset adds Vega support as a copy of GCN3.
Configs have been modified to include both ISAs.
The current implementation is not yet complete and needs
further modifications to fully comply with the ISA manual:

https://developer.amd.com/wp-content/resources/Vega_Shader_ISA_28July2017.pdf

Change-Id: I608aa6747a45594f8e1bd7802da1883cf612168b
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42204
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Author: Kyle Roarty
Authored: 2019-06-27 12:22:29 -04:00
Committed by: Matt Sinclair
Commit: f7d4ff6ef5 (parent: c7ee47efc9)
20 changed files with 144242 additions and 1 deletion


@@ -68,6 +68,12 @@ arch-gcn3:
- Matt Poremba <matthew.poremba@amd.com>
- Matt Sinclair <sinclair@cs.wisc.edu>
arch-vega:
status: maintained
maintainers:
- Matt Poremba <matthew.poremba@amd.com>
- Matt Sinclair <sinclair@cs.wisc.edu>
arch-mips:
status: orphaned


@@ -67,7 +67,7 @@ env.SwitchingHeaders(
'''),
env.subst('${TARGET_ISA}'))
amdgpu_isa = ['gcn3']
amdgpu_isa = ['gcn3', 'vega']
env.SwitchingHeaders(
Split('''


@@ -0,0 +1,45 @@
# -*- mode:python -*-
# Copyright (c) 2021 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import sys
Import('*')
if env['TARGET_GPU_ISA'] == 'vega':
Source('decoder.cc')
Source('insts/gpu_static_inst.cc')
Source('insts/instructions.cc')
Source('insts/op_encodings.cc')
Source('isa.cc')
Source('registers.cc')
DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA')


@@ -0,0 +1,36 @@
# -*- mode:python -*-
# Copyright (c) 2021 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
Import('*')
all_gpu_isa_list.append('vega')

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,103 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_GPU_ISA_HH__
#define __ARCH_VEGA_GPU_ISA_HH__
#include <array>
#include <type_traits>
#include "arch/amdgpu/vega/gpu_registers.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/misc.hh"
class Wavefront;
namespace VegaISA
{
class GPUISA
{
public:
GPUISA(Wavefront &wf);
template<typename T> T
readConstVal(int opIdx) const
{
panic_if(!std::is_integral<T>::value, "Constant values must "
"be an integer.\n");
T val(0);
if (isPosConstVal(opIdx)) {
val = (T)readPosConstReg(opIdx);
}
if (isNegConstVal(opIdx)) {
val = (T)readNegConstReg(opIdx);
}
return val;
}
ScalarRegU32 readMiscReg(int opIdx) const;
void writeMiscReg(int opIdx, ScalarRegU32 operandVal);
bool hasScalarUnit() const { return true; }
void advancePC(GPUDynInstPtr gpuDynInst);
private:
ScalarRegU32 readPosConstReg(int opIdx) const
{
return posConstRegs[opIdx - REG_INT_CONST_POS_MIN];
}
ScalarRegI32 readNegConstReg(int opIdx) const
{
return negConstRegs[opIdx - REG_INT_CONST_NEG_MIN];
}
static const std::array<const ScalarRegU32, NumPosConstRegs>
posConstRegs;
static const std::array<const ScalarRegI32, NumNegConstRegs>
negConstRegs;
// parent wavefront
Wavefront &wavefront;
// shader status bits
StatusReg statusReg;
// memory descriptor reg
ScalarRegU32 m0;
};
} // namespace VegaISA
#endif // __ARCH_VEGA_GPU_ISA_HH__
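
For readers tracing the inline-constant plumbing above: readConstVal() indexes the static posConstRegs/negConstRegs tables by op selector. The standalone sketch below is not part of this changeset; it assumes those tables hold the values 1..64 and -1..-16 implied by the op-selector comments in gpu_registers.hh, and the hypothetical readPosConst/readNegConst stand in for the private readPosConstReg/readNegConstReg accessors.

    #include <cassert>
    #include <cstdint>

    // Op-selector bounds copied from the VegaISA::OpSelector enum.
    constexpr int REG_INT_CONST_POS_MIN = 129;
    constexpr int REG_INT_CONST_NEG_MIN = 193;

    // Hypothetical stand-ins for readPosConstReg()/readNegConstReg():
    // selectors 129-192 encode the constants 1..64, and selectors
    // 193-208 encode the constants -1..-16.
    uint32_t readPosConst(int opIdx) { return opIdx - REG_INT_CONST_POS_MIN + 1; }
    int32_t readNegConst(int opIdx) { return -(opIdx - REG_INT_CONST_NEG_MIN + 1); }

    int main()
    {
        assert(readPosConst(129) == 1);   // REG_INT_CONST_POS_MIN
        assert(readPosConst(192) == 64);  // REG_INT_CONST_POS_MAX
        assert(readNegConst(193) == -1);  // REG_INT_CONST_NEG_MIN
        assert(readNegConst(208) == -16); // REG_INT_CONST_NEG_MAX
        return 0;
    }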


@@ -0,0 +1,186 @@
/*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_GPU_MEM_HELPERS_HH__
#define __ARCH_VEGA_GPU_MEM_HELPERS_HH__
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
#include "arch/amdgpu/vega/insts/op_encodings.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
/**
* Helper function for instructions declared in op_encodings. This function
* takes in all of the arguments for a given memory request we are trying to
* initialize, then submits one or two requests depending on whether the
* original request is aligned or unaligned.
*/
template<typename T, int N>
inline void
initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
bool is_atomic=false)
{
// local variables
int req_size = N * sizeof(T);
int block_size = gpuDynInst->computeUnit()->cacheLineSize();
Addr vaddr = 0, split_addr = 0;
bool misaligned_acc = false;
RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
gpuDynInst->resetEntireStatusVector();
for (int lane = 0; lane < VegaISA::NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
vaddr = gpuDynInst->addr[lane];
/**
* the base address of the cache line where the last
* byte of the request will be stored.
*/
split_addr = roundDown(vaddr + req_size - 1, block_size);
assert(split_addr <= vaddr || split_addr - vaddr < block_size);
/**
* if the base cache line address of the last byte is
* greater than the address of the first byte then we have
* a misaligned access.
*/
misaligned_acc = split_addr > vaddr;
if (is_atomic) {
// make sure request is word aligned
assert((vaddr & 0x3) == 0);
// a given lane's atomic can't cross cache lines
assert(!misaligned_acc);
req = std::make_shared<Request>(0, vaddr, sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId,
gpuDynInst->makeAtomicOpFunctor<T>(
&(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
&(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
} else {
req = std::make_shared<Request>(0, vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
}
if (misaligned_acc) {
gpuDynInst->setStatusVector(lane, 2);
req->splitOnVaddr(split_addr, req1, req2);
gpuDynInst->setRequestFlags(req1);
gpuDynInst->setRequestFlags(req2);
pkt1 = new Packet(req1, mem_req_type);
pkt2 = new Packet(req2, mem_req_type);
pkt1->dataStatic(&(reinterpret_cast<T*>(
gpuDynInst->d_data))[lane * N]);
pkt2->dataStatic(&(reinterpret_cast<T*>(
gpuDynInst->d_data))[lane * N + req1->getSize()]);
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
"request for %#x\n", gpuDynInst->cu_id,
gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
split_addr);
gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
} else {
gpuDynInst->setStatusVector(lane, 1);
gpuDynInst->setRequestFlags(req);
pkt = new Packet(req, mem_req_type);
pkt->dataStatic(&(reinterpret_cast<T*>(
gpuDynInst->d_data))[lane * N]);
gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
}
} else { // if lane is not active, then no pending requests
gpuDynInst->setStatusVector(lane, 0);
}
}
}
/**
* Helper function for scalar instructions declared in op_encodings. This
* function takes in all of the arguments for a given memory request we are
* trying to initialize, then submits one or two requests depending on whether
* the original request is aligned or unaligned.
*/
template<typename T, int N>
inline void
initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
int req_size = N * sizeof(T);
int block_size = gpuDynInst->computeUnit()->cacheLineSize();
Addr vaddr = gpuDynInst->scalarAddr;
/**
* the base address of the cache line where the last byte of
* the request will be stored.
*/
Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
assert(split_addr <= vaddr || split_addr - vaddr < block_size);
/**
* if the base cache line address of the last byte is greater
* than the address of the first byte then we have a misaligned
* access.
*/
bool misaligned_acc = split_addr > vaddr;
RequestPtr req = std::make_shared<Request>(0, vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
if (misaligned_acc) {
RequestPtr req1, req2;
req->splitOnVaddr(split_addr, req1, req2);
gpuDynInst->numScalarReqs = 2;
gpuDynInst->setRequestFlags(req1);
gpuDynInst->setRequestFlags(req2);
PacketPtr pkt1 = new Packet(req1, mem_req_type);
PacketPtr pkt2 = new Packet(req2, mem_req_type);
pkt1->dataStatic(gpuDynInst->scalar_data);
pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
" %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, split_addr);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
} else {
gpuDynInst->numScalarReqs = 1;
gpuDynInst->setRequestFlags(req);
PacketPtr pkt = new Packet(req, mem_req_type);
pkt->dataStatic(gpuDynInst->scalar_data);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
}
}
#endif // __ARCH_VEGA_GPU_MEM_HELPERS_HH__
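
Both helpers hinge on the same split-address test: round the address of the request's last byte down to a cache-line boundary and compare it against the first byte. A minimal standalone sketch of that test, with a local roundDown standing in for the one from gem5's base/intmath.hh and an assumed 64-byte line size:

    #include <cassert>
    #include <cstdint>

    using Addr = uint64_t;

    // Local stand-in for gem5's roundDown(): round addr down to a
    // multiple of align (a power of two here).
    static Addr roundDown(Addr addr, Addr align) { return addr & ~(align - 1); }

    int main()
    {
        const Addr block_size = 64; // assumed cache line size
        const int req_size = 16;    // e.g., N = 4 DWORDs of 4 bytes each

        // Aligned case: first and last byte share a cache line, so one
        // request suffices.
        Addr vaddr = 0x1000;
        Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
        assert(!(split_addr > vaddr)); // not misaligned

        // Straddling case: the helper would split the request in two at
        // split_addr, covering [0x1038, 0x1040) and [0x1040, 0x1048).
        vaddr = 0x1038;
        split_addr = roundDown(vaddr + req_size - 1, block_size);
        assert(split_addr == 0x1040 && split_addr > vaddr);
        return 0;
    }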


@@ -0,0 +1,256 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_REGISTERS_HH__
#define __ARCH_VEGA_REGISTERS_HH__
#include <array>
#include <cstdint>
#include <string>
#include "arch/generic/vec_reg.hh"
#include "base/intmath.hh"
#include "base/logging.hh"
namespace VegaISA
{
enum OpSelector : int
{
REG_SGPR_MIN = 0,
REG_SGPR_MAX = 101,
REG_FLAT_SCRATCH_LO = 102,
REG_FLAT_SCRATCH_HI = 103,
REG_XNACK_MASK_LO = 104,
REG_XNACK_MASK_HI = 105,
REG_VCC_LO = 106,
REG_VCC_HI = 107,
REG_TBA_LO = 108,
REG_TBA_HI = 109,
REG_TMA_LO = 110,
REG_TMA_HI = 111,
REG_TTMP_0 = 112,
REG_TTMP_1 = 113,
REG_TTMP_2 = 114,
REG_TTMP_3 = 115,
REG_TTMP_4 = 116,
REG_TTMP_5 = 117,
REG_TTMP_6 = 118,
REG_TTMP_7 = 119,
REG_TTMP_8 = 120,
REG_TTMP_9 = 121,
REG_TTMP_10 = 122,
REG_TTMP_11 = 123,
REG_M0 = 124,
REG_RESERVED_1 = 125,
REG_EXEC_LO = 126,
REG_EXEC_HI = 127,
REG_ZERO = 128,
REG_INT_CONST_POS_MIN = 129,
REG_INT_CONST_POS_MAX = 192,
REG_INT_CONST_NEG_MIN = 193,
REG_INT_CONST_NEG_MAX = 208,
REG_RESERVED_2 = 209,
REG_RESERVED_3 = 210,
REG_RESERVED_4 = 211,
REG_RESERVED_5 = 212,
REG_RESERVED_6 = 213,
REG_RESERVED_7 = 214,
REG_RESERVED_8 = 215,
REG_RESERVED_9 = 216,
REG_RESERVED_10 = 217,
REG_RESERVED_11 = 218,
REG_RESERVED_12 = 219,
REG_RESERVED_13 = 220,
REG_RESERVED_14 = 221,
REG_RESERVED_15 = 222,
REG_RESERVED_16 = 223,
REG_RESERVED_17 = 224,
REG_RESERVED_18 = 225,
REG_RESERVED_19 = 226,
REG_RESERVED_20 = 227,
REG_RESERVED_21 = 228,
REG_RESERVED_22 = 229,
REG_RESERVED_23 = 230,
REG_RESERVED_24 = 231,
REG_RESERVED_25 = 232,
REG_RESERVED_26 = 233,
REG_RESERVED_27 = 234,
REG_RESERVED_28 = 235,
REG_RESERVED_29 = 236,
REG_RESERVED_30 = 237,
REG_RESERVED_31 = 238,
REG_RESERVED_32 = 239,
REG_POS_HALF = 240,
REG_NEG_HALF = 241,
REG_POS_ONE = 242,
REG_NEG_ONE = 243,
REG_POS_TWO = 244,
REG_NEG_TWO = 245,
REG_POS_FOUR = 246,
REG_NEG_FOUR = 247,
REG_PI = 248,
/* NOTE: SDWA and SWDA both refer to sub d-word addressing */
REG_SRC_SWDA = 249,
REG_SRC_DPP = 250,
REG_VCCZ = 251,
REG_EXECZ = 252,
REG_SCC = 253,
REG_LDS_DIRECT = 254,
REG_SRC_LITERAL = 255,
REG_VGPR_MIN = 256,
REG_VGPR_MAX = 511
};
constexpr size_t MaxOperandDwords(16);
const int NumVecElemPerVecReg(64);
// op selector values 129 - 192 correspond to const values 1 - 64
const int NumPosConstRegs = REG_INT_CONST_POS_MAX
- REG_INT_CONST_POS_MIN + 1;
// op selector values 193 - 208 correspond to const values -1 to -16
const int NumNegConstRegs = REG_INT_CONST_NEG_MAX
- REG_INT_CONST_NEG_MIN + 1;
const int BITS_PER_BYTE = 8;
const int BITS_PER_WORD = 16;
const int MSB_PER_BYTE = (BITS_PER_BYTE - 1);
const int MSB_PER_WORD = (BITS_PER_WORD - 1);
// typedefs for the various sizes/types of scalar regs
typedef uint8_t ScalarRegU8;
typedef int8_t ScalarRegI8;
typedef uint16_t ScalarRegU16;
typedef int16_t ScalarRegI16;
typedef uint32_t ScalarRegU32;
typedef int32_t ScalarRegI32;
typedef float ScalarRegF32;
typedef uint64_t ScalarRegU64;
typedef int64_t ScalarRegI64;
typedef double ScalarRegF64;
// typedefs for the various sizes/types of vector reg elements
typedef uint8_t VecElemU8;
typedef int8_t VecElemI8;
typedef uint16_t VecElemU16;
typedef int16_t VecElemI16;
typedef uint32_t VecElemU32;
typedef int32_t VecElemI32;
typedef float VecElemF32;
typedef uint64_t VecElemU64;
typedef int64_t VecElemI64;
typedef double VecElemF64;
const int DWORDSize = sizeof(VecElemU32);
/**
* Size of a single-precision register in DWORDs.
*/
const int RegSizeDWORDs = sizeof(VecElemU32) / DWORDSize;
// typedefs for the various sizes/types of vector regs
using VecRegU8 = ::VecRegT<VecElemU8, NumVecElemPerVecReg, false>;
using VecRegI8 = ::VecRegT<VecElemI8, NumVecElemPerVecReg, false>;
using VecRegU16 = ::VecRegT<VecElemU16, NumVecElemPerVecReg, false>;
using VecRegI16 = ::VecRegT<VecElemI16, NumVecElemPerVecReg, false>;
using VecRegU32 = ::VecRegT<VecElemU32, NumVecElemPerVecReg, false>;
using VecRegI32 = ::VecRegT<VecElemI32, NumVecElemPerVecReg, false>;
using VecRegF32 = ::VecRegT<VecElemF32, NumVecElemPerVecReg, false>;
using VecRegU64 = ::VecRegT<VecElemU64, NumVecElemPerVecReg, false>;
using VecRegI64 = ::VecRegT<VecElemI64, NumVecElemPerVecReg, false>;
using VecRegF64 = ::VecRegT<VecElemF64, NumVecElemPerVecReg, false>;
// non-writeable versions of vector regs
using ConstVecRegU8 = ::VecRegT<VecElemU8, NumVecElemPerVecReg, true>;
using ConstVecRegI8 = ::VecRegT<VecElemI8, NumVecElemPerVecReg, true>;
using ConstVecRegU16 = ::VecRegT<VecElemU16, NumVecElemPerVecReg, true>;
using ConstVecRegI16 = ::VecRegT<VecElemI16, NumVecElemPerVecReg, true>;
using ConstVecRegU32 = ::VecRegT<VecElemU32, NumVecElemPerVecReg, true>;
using ConstVecRegI32 = ::VecRegT<VecElemI32, NumVecElemPerVecReg, true>;
using ConstVecRegF32 = ::VecRegT<VecElemF32, NumVecElemPerVecReg, true>;
using ConstVecRegU64 = ::VecRegT<VecElemU64, NumVecElemPerVecReg, true>;
using ConstVecRegI64 = ::VecRegT<VecElemI64, NumVecElemPerVecReg, true>;
using ConstVecRegF64 = ::VecRegT<VecElemF64, NumVecElemPerVecReg, true>;
using VecRegContainerU8 = VecRegU8::Container;
using VecRegContainerU16 = VecRegU16::Container;
using VecRegContainerU32 = VecRegU32::Container;
using VecRegContainerU64 = VecRegU64::Container;
struct StatusReg
{
StatusReg() : SCC(0), SPI_PRIO(0), USER_PRIO(0), PRIV(0), TRAP_EN(0),
TTRACE_EN(0), EXPORT_RDY(0), EXECZ(0), VCCZ(0), IN_TG(0),
IN_BARRIER(0), HALT(0), TRAP(0), TTRACE_CU_EN(0), VALID(0),
ECC_ERR(0), SKIP_EXPORT(0), PERF_EN(0), COND_DBG_USER(0),
COND_DBG_SYS(0), ALLOW_REPLAY(0), INSTRUCTION_ATC(0), RESERVED(0),
MUST_EXPORT(0), RESERVED_1(0)
{
}
uint32_t SCC : 1;
uint32_t SPI_PRIO : 2;
uint32_t USER_PRIO : 2;
uint32_t PRIV : 1;
uint32_t TRAP_EN : 1;
uint32_t TTRACE_EN : 1;
uint32_t EXPORT_RDY : 1;
uint32_t EXECZ : 1;
uint32_t VCCZ : 1;
uint32_t IN_TG : 1;
uint32_t IN_BARRIER : 1;
uint32_t HALT : 1;
uint32_t TRAP : 1;
uint32_t TTRACE_CU_EN : 1;
uint32_t VALID : 1;
uint32_t ECC_ERR : 1;
uint32_t SKIP_EXPORT : 1;
uint32_t PERF_EN : 1;
uint32_t COND_DBG_USER : 1;
uint32_t COND_DBG_SYS : 1;
uint32_t ALLOW_REPLAY : 1;
uint32_t INSTRUCTION_ATC : 1;
uint32_t RESERVED : 3;
uint32_t MUST_EXPORT : 1;
uint32_t RESERVED_1 : 4;
};
std::string opSelectorToRegSym(int opIdx, int numRegs=0);
int opSelectorToRegIdx(int opIdx, int numScalarRegs);
bool isPosConstVal(int opIdx);
bool isNegConstVal(int opIdx);
bool isConstVal(int opIdx);
bool isLiteral(int opIdx);
bool isScalarReg(int opIdx);
bool isVectorReg(int opIdx);
bool isFlatScratchReg(int opIdx);
bool isExecMask(int opIdx);
bool isVccReg(int opIdx);
} // namespace VegaISA
#endif // __ARCH_VEGA_REGISTERS_HH__
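
The classification helpers declared at the end of this header are defined in registers.cc (not shown in this view). As a rough illustration of what they check, here is a standalone sketch of plausible range-based implementations over the OpSelector values; the real definitions may differ in detail:

    #include <cassert>

    // Selected values copied from the OpSelector enum above.
    enum : int
    {
        REG_INT_CONST_POS_MIN = 129, REG_INT_CONST_POS_MAX = 192,
        REG_INT_CONST_NEG_MIN = 193, REG_INT_CONST_NEG_MAX = 208,
        REG_SRC_LITERAL = 255,
        REG_VGPR_MIN = 256, REG_VGPR_MAX = 511
    };

    // Plausible range checks; not the changeset's actual definitions.
    bool isPosConstVal(int opIdx)
    { return opIdx >= REG_INT_CONST_POS_MIN && opIdx <= REG_INT_CONST_POS_MAX; }
    bool isNegConstVal(int opIdx)
    { return opIdx >= REG_INT_CONST_NEG_MIN && opIdx <= REG_INT_CONST_NEG_MAX; }
    bool isConstVal(int opIdx)
    { return isPosConstVal(opIdx) || isNegConstVal(opIdx); }
    bool isLiteral(int opIdx) { return opIdx == REG_SRC_LITERAL; }
    bool isVectorReg(int opIdx)
    { return opIdx >= REG_VGPR_MIN && opIdx <= REG_VGPR_MAX; }

    int main()
    {
        assert(isConstVal(129) && isConstVal(208) && !isConstVal(255));
        assert(isLiteral(255));
        assert(isVectorReg(256) && !isVectorReg(128));
        return 0;
    }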


@@ -0,0 +1,64 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_GPU_TYPES_HH__
#define __ARCH_VEGA_GPU_TYPES_HH__
#include <cstdint>
namespace VegaISA
{
union InstFormat;
/**
* used to represent a GPU inst in its raw format. VEGA
* instructions may be 32b or 64b, therefore we represent
* a raw inst with 64b to ensure that all of its inst data,
* including potential immediate values, may be represented
* in the worst case.
*/
typedef uint64_t RawMachInst;
/**
* used to represent the encoding of a VEGA inst. each portion
* of a VEGA inst must be 1 DWORD (32b), so we use a pointer
* to InstFormat type (which is 32b). for the case in which we
* need multiple DWORDs to represent a single inst, this pointer
* essentially acts as an array of the DWORDs needed to represent
* the entire inst encoding.
*/
typedef InstFormat *MachInst;
} // namespace VegaISA
#endif // __ARCH_VEGA_GPU_TYPES_HH__
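
To make the 64b choice concrete: a 32b VEGA encoding followed by a 32b literal fits in one RawMachInst, and the decoder can peel it apart one DWORD at a time, which is what the InstFormat-pointer MachInst enables. A small standalone sketch; the encoding and literal values here are made up for illustration:

    #include <cassert>
    #include <cstdint>

    typedef uint64_t RawMachInst;

    int main()
    {
        // Hypothetical raw inst: low DWORD is the instruction encoding,
        // high DWORD is a trailing 32b literal constant.
        const uint32_t enc = 0x7E000280; // made-up encoding bits
        const uint32_t lit = 0xDEADBEEF; // made-up literal
        RawMachInst raw = (RawMachInst(lit) << 32) | enc;

        // Peel the raw inst apart DWORD by DWORD, mirroring how a
        // MachInst (InstFormat*) walks a multi-DWORD encoding.
        assert(uint32_t(raw) == enc);
        assert(uint32_t(raw >> 32) == lit);
        return 0;
    }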


@@ -0,0 +1,58 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
#include "arch/amdgpu/vega/gpu_decoder.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "debug/GPUExec.hh"
#include "gpu-compute/flexible_pool_manager.hh"
#include "gpu-compute/shader.hh"
namespace VegaISA
{
VEGAGPUStaticInst::VEGAGPUStaticInst(const std::string &opcode)
: GPUStaticInst(opcode), _srcLiteral(0)
{
}
VEGAGPUStaticInst::~VEGAGPUStaticInst()
{
}
void
VEGAGPUStaticInst::panicUnimplemented() const
{
fatal("Encountered unimplemented VEGA instruction: %s\n", _opcode);
}
} // namespace VegaISA


@@ -0,0 +1,94 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_INSTS_GPU_STATIC_INST_HH__
#define __ARCH_VEGA_INSTS_GPU_STATIC_INST_HH__
#include "arch/amdgpu/vega/gpu_registers.hh"
#include "arch/amdgpu/vega/operand.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
namespace VegaISA
{
class VEGAGPUStaticInst : public GPUStaticInst
{
public:
VEGAGPUStaticInst(const std::string &opcode);
~VEGAGPUStaticInst();
void generateDisassembly() override { disassembly = _opcode; }
bool
isFlatScratchRegister(int opIdx) override
{
return isFlatScratchReg(opIdx);
}
bool isScalarRegister(int opIdx) override { return false; }
bool isVectorRegister(int opIdx) override { return false; }
bool isSrcOperand(int opIdx) override { return false; }
bool isDstOperand(int opIdx) override { return false; }
int getOperandSize(int opIdx) override { return 0; }
int
getRegisterIndex(int opIdx, int num_scalar_regs) override
{
return 0;
}
/**
* Return the number of tokens needed by the coalescer. In VEGA there
* is generally one packet per memory request per lane generated. In
* HSAIL, the number of dest operands is used for loads and src
* operands for stores. This method should be overridden on a per-inst
* basis when this value differs.
*/
int coalescerTokenCount() const override { return 1; }
ScalarRegU32 srcLiteral() const override { return _srcLiteral; }
protected:
void panicUnimplemented() const;
/**
* if the instruction has a src literal - an immediate
* value that is part of the instruction stream - we
* store that here
*/
ScalarRegU32 _srcLiteral;
}; // class VEGAGPUStaticInst
} // namespace VegaISA
#endif //__ARCH_VEGA_INSTS_GPU_STATIC_INST_HH__


@@ -0,0 +1,894 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_INSTS_INST_UTIL_HH__
#define __ARCH_VEGA_INSTS_INST_UTIL_HH__
#include <cmath>
#include "arch/amdgpu/vega/gpu_registers.hh"
// values for SDWA select operations
enum SDWASelVals : int
{
SDWA_BYTE_0 = 0, /* select data[7:0] */
SDWA_BYTE_1 = 1, /* select data[15:8] */
SDWA_BYTE_2 = 2, /* select data[23:16] */
SDWA_BYTE_3 = 3, /* select data[31:24] */
SDWA_WORD_0 = 4, /* select data[15:0] */
SDWA_WORD_1 = 5, /* select data[31:16] */
SDWA_DWORD = 6 /* select data[31:0] */
};
// values for format of destination bits for SDWA operations
enum SDWADstVals : int
{
SDWA_UNUSED_PAD = 0, /* Pad all unused bits with 0 */
SDWA_UNUSED_SEXT = 1, /* Sign-extend upper bits; pad lower bits w/ 0 */
SDWA_UNUSED_PRESERVE = 2 /* select data[31:0] */
};
// values for DPP operations
enum SqDPPVals : int
{
SQ_DPP_QUAD_PERM_MAX = 0xFF,
SQ_DPP_RESERVED = 0x100,
SQ_DPP_ROW_SL1 = 0x101,
SQ_DPP_ROW_SL15 = 0x10F,
SQ_DPP_ROW_SR1 = 0x111,
SQ_DPP_ROW_SR15 = 0x11F,
SQ_DPP_ROW_RR1 = 0x121,
SQ_DPP_ROW_RR15 = 0x12F,
SQ_DPP_WF_SL1 = 0x130,
SQ_DPP_WF_RL1 = 0x134,
SQ_DPP_WF_SR1 = 0x138,
SQ_DPP_WF_RR1 = 0x13C,
SQ_DPP_ROW_MIRROR = 0x140,
SQ_DPP_ROW_HALF_MIRROR = 0x141,
SQ_DPP_ROW_BCAST15 = 0x142,
SQ_DPP_ROW_BCAST31 = 0x143
};
static const int ROW_SIZE = 16; /* 16 registers per row */
static const int NUM_BANKS = 4; /* 64 registers, 16/bank */
namespace VegaISA
{
template<typename T>
inline T
wholeQuadMode(T val)
{
T wqm = 0;
T mask = 0xF;
for (T bits = val; mask != 0; mask <<= 4)
if ((bits & mask) != 0)
wqm |= mask;
return wqm;
}
template<typename T>
inline T
quadMask(T val)
{
T qmsk = 0;
T mask = 0xF;
T qbit = 0x1;
for (T bits = val; mask != 0; mask <<= 4, qbit <<= 1) {
if (bits & mask) {
qmsk |= qbit;
}
}
return qmsk;
}
template<typename T>
inline ScalarRegI32
countZeroBits(T val)
{
ScalarRegI32 num_zeros
= std::numeric_limits<T>::digits - popCount(val);
return num_zeros;
}
template<typename T>
inline ScalarRegI32
findFirstZero(T val)
{
if (val == ~T(0)) {
return -1;
}
return findLsbSet(~val);
}
template<typename T>
inline ScalarRegI32
findFirstOne(T val)
{
if (!val) {
return -1;
}
return findLsbSet(val);
}
template<typename T>
inline ScalarRegI32
findFirstOneMsb(T val)
{
if (!val) {
return -1;
}
return findMsbSet(val);
}
template<typename T>
inline ScalarRegI32
countZeroBitsMsb(T val)
{
if (!val) {
return -1;
}
return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
}
inline ScalarRegI32
firstOppositeSignBit(ScalarRegI32 val)
{
bool found(false);
bool sign_bit = (val & 0x80000000) != 0;
ScalarRegU32 tmp_val(0);
int count(0);
if (!val || val == -1) {
return -1;
}
for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
tmp_val = val & (0x80000000 >> i);
if (!sign_bit) {
if (tmp_val) {
found = true;
break;
}
} else {
if (!tmp_val) {
found = true;
break;
}
}
++count;
}
if (found) {
return count;
} else {
return -1;
}
}
inline ScalarRegI32
firstOppositeSignBit(ScalarRegI64 val)
{
bool found(false);
bool sign_bit = (val & 0x8000000000000000ULL) != 0;
ScalarRegU64 tmp_val(0);
int count(0);
if (!val || val == -1) {
return -1;
}
for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
tmp_val = val & (0x8000000000000000ULL >> i);
if (!sign_bit) {
if (tmp_val) {
found = true;
break;
}
} else {
if (!tmp_val) {
found = true;
break;
}
}
++count;
}
if (found) {
return count;
} else {
return -1;
}
}
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
if (std::is_floating_point<T>::value) {
return std::fmax(std::fmin(val_0, val_1),
std::fmin(std::fmax(val_0, val_1), val_2));
} else {
return std::max(std::min(val_0, val_1),
std::min(std::max(val_0, val_1), val_2));
}
}
template <typename T>
inline T roundNearestEven(T val)
{
T int_part = 0;
T nearest_round = std::floor(val + 0.5);
if ((int)std::floor(val) % 2 == 0
&& std::modf(std::abs(val), &int_part) == 0.5) {
nearest_round = nearest_round - 1;
}
return nearest_round;
}
inline VecElemU32
muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1,
VecElemU64 val_2)
{
__uint128_t u0 = (__uint128_t)val_0;
__uint128_t u1 = (__uint128_t)val_1;
__uint128_t u2 = (__uint128_t)val_2;
__uint128_t result = u0 * u1 + u2;
dst = (VecElemU64)result;
return (VecElemU32)(result >> 64) ? 1 : 0;
}
inline VecElemU32
muladd(VecElemI64 &dst, VecElemI32 val_0, VecElemI32 val_1,
VecElemI64 val_2)
{
__int128_t u0 = (__int128_t)val_0;
__int128_t u1 = (__int128_t)val_1;
__int128_t u2 = (__int128_t)val_2;
__int128_t result = u0 * u1 + u2;
dst = (VecElemI64)result;
return (VecElemU32)(result >> 64) ? 1 : 0;
}
/**
* dppInstImpl is a helper function that applies the given DPP_CTRL
* operation to the given vector register lane. The returned lane is the
* source lane to read from for this destination lane and DPP_CTRL word.
*
* Currently the values are:
* 0x0 - 0xFF: full permute of four threads
* 0x100: reserved
* 0x101 - 0x10F: row shift left by 1-15 threads
* 0x111 - 0x11F: row shift right by 1-15 threads
* 0x121 - 0x12F: row rotate right by 1-15 threads
* 0x130: wavefront left shift by 1 thread
* 0x134: wavefront left rotate by 1 thread
* 0x138: wavefront right shift by 1 thread
* 0x13C: wavefront right rotate by 1 thread
* 0x140: mirror threads within row
* 0x141: mirror threads within 1/2 row (8 threads)
* 0x142: broadcast 15th thread of each row to next row
* 0x143: broadcast thread 31 to rows 2 and 3
*/
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
int rowOffset, bool & outOfBounds)
{
// local variables
// newLane will be the same as the input lane unless swizzling happens
int newLane = currLane;
// for shift/rotate permutations; positive values are LEFT rotates
int count = 1;
int localRowOffset = rowOffset;
int localRowNum = rowNum;
if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
int quadBase = (currLane & ~(3));
int quadPix = (currLane & 3);
quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
newLane = (quadBase | quadPix);
} else if (dppCtrl == SQ_DPP_RESERVED) {
panic("ERROR: instruction using reserved DPP_CTRL value\n");
} else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
(dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
count = (dppCtrl - SQ_DPP_ROW_SL1 + 1);
if ((localRowOffset + count >= 0) &&
(localRowOffset + count < ROW_SIZE)) {
localRowOffset += count;
newLane = (rowNum | localRowOffset);
} else {
outOfBounds = true;
}
} else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
(dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1);
if ((localRowOffset + count >= 0) &&
(localRowOffset + count < ROW_SIZE)) {
localRowOffset += count;
newLane = (rowNum | localRowOffset);
} else {
outOfBounds = true;
}
} else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
(dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1);
localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
newLane = (rowNum | localRowOffset);
} else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
count = 1;
int currVal = (currLane + count);
if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
newLane += count;
} else {
outOfBounds = true;
}
} else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
count = 1;
newLane = (currLane + count + NumVecElemPerVecReg) %
NumVecElemPerVecReg;
} else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
count = -1;
int currVal = (currLane + count);
if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
newLane += count;
} else {
outOfBounds = true;
}
} else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
count = -1;
newLane = (currLane + count + NumVecElemPerVecReg) %
NumVecElemPerVecReg;
} else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
localRowOffset = (15 - localRowOffset);
newLane = (rowNum | localRowOffset);
} else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
localRowNum = (currLane & ~0x7);
localRowOffset = (currLane & 0x7);
localRowOffset = (7 - localRowOffset);
newLane = (localRowNum | localRowOffset);
} else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
count = 15;
if (currLane > count) {
newLane = (currLane & ~count) - 1;
}
} else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
count = 31;
if (currLane > count) {
newLane = (currLane & ~count) - 1;
}
} else {
panic("Unimplemented DPP control operation: %d\n", dppCtrl);
}
return newLane;
}
/**
* processDPP is a helper function for implementing Data Parallel Primitive
* instructions. This function may be called by many different VOP1
* instructions to do operations within a register.
*/
template<typename T>
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
T & src0)
{
// local variables
SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
int boundCtrl = dppInst.BOUND_CTRL;
int bankMask = dppInst.BANK_MASK;
int rowMask = dppInst.ROW_MASK;
// row, bank info to be calculated per lane
int rowNum = 0, bankNum = 0, rowOffset = 0;
// outLane will be the same as the input lane unless swizzling happens
int outLane = 0;
bool laneDisabled = false;
// flags used for determining if a lane should be written to/reset/etc.
bool outOfBounds = false, zeroSrc = false;
long long threadValid = 0;
/**
* STEP 1a: check if the absolute value (ABS) or negation (NEG) tags
* are set. If so, do the appropriate action(s) on src0 and/or src1.
*
* NOTE: ABS takes priority over NEG.
*/
if (dppInst.SRC0_NEG) {
src0.negModifier();
}
if (dppInst.SRC0_ABS) {
src0.absModifier();
}
// iterate over all register lanes, performing steps 2-4
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
threadValid = (0x1LL << lane);
/**
* STEP 2: check the row and bank mask values. These determine
* which threads are enabled for the subsequent DPP_CTRL
* operations.
*/
rowNum = (lane / ROW_SIZE);
rowOffset = (lane % ROW_SIZE);
bankNum = (rowOffset / NUM_BANKS);
if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
laneDisabled = true;
}
/**
* STEP 3: Handle the potential values of DPP_CTRL:
* 0x0 - 0xFF: full permute of four threads
* 0x100: reserved
* 0x101 - 0x10F: row shift left by 1-15 threads
* 0x111 - 0x11F: row shift right by 1-15 threads
* 0x121 - 0x12F: row rotate right by 1-15 threads
* 0x130: wavefront left shift by 1 thread
* 0x134: wavefront left rotate by 1 thread
* 0x138: wavefront right shift by 1 thread
* 0x13C: wavefront right rotate by 1 thread
* 0x140: mirror threads within row
* 0x141: mirror threads within 1/2 row (8 threads)
* 0x142: broadcast 15th thread of each row to next row
* 0x143: broadcast thread 31 to rows 2 and 3
*/
if (!laneDisabled) {
outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
outOfBounds);
}
/**
* STEP 4: Implement bound control for disabled threads. If thread
* is disabled but boundCtrl is set, then we need to set the source
* data to 0 (i.e., set this lane to 0).
*/
if (laneDisabled) {
threadValid = 0;
} else if (outOfBounds) {
if (boundCtrl == 1) {
zeroSrc = true;
} else {
threadValid = 0;
}
} else if (!gpuDynInst->exec_mask[lane]) {
if (boundCtrl == 1) {
zeroSrc = true;
} else {
threadValid = 0;
}
}
if (threadValid != 0 && !outOfBounds && !zeroSrc) {
assert(!laneDisabled);
src0[outLane] = src0[lane];
} else if (zeroSrc) {
src0[lane] = 0;
}
// reset per-lane flags for next iteration
laneDisabled = false;
outOfBounds = false;
zeroSrc = false;
}
}
/**
* processDPP is a helper function for implementing Data Parallel Primitive
* instructions. This function may be called by many different
* VOP2/VOPC instructions to do operations within a register.
*/
template<typename T>
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
T & src0, T & src1)
{
/**
* STEP 1b: check if the absolute value (ABS) or negation (NEG) tags
* are set. If so, do the appropriate action(s) on src0 and/or src1.
*
* NOTE: ABS takes priority over NEG.
*/
if (dppInst.SRC1_NEG) {
src1.negModifier();
}
if (dppInst.SRC1_ABS) {
src1.absModifier();
}
// Since the only difference between VOP1 and VOP2/VOPC instructions is
// SRC1, which is only used for negation/absolute value, call the other
// version to do everything else.
processDPP(gpuDynInst, dppInst, src0);
}
/**
* sdwaInstSrcImpl_helper contains the per-lane code for selecting the
* appropriate bytes/words of the lane and doing the appropriate
* masking/padding/sign extending. It returns the value after these
* operations are done on it.
*/
template<typename T>
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
const SDWASelVals sel, const bool signExt)
{
// local variables
int low_bit = 0, high_bit = 0;
bool signExt_local = signExt;
T retVal = 0;
// if we're preserving all of the bits, then we can immediately return
if (sel == SDWA_DWORD) {
return currOperVal;
}
if (sel < SDWA_WORD_0) { // we are selecting 1 byte
/*
Process byte 0 first. This code either selects the original bits
of byte 0, or makes the bits of the selected byte be byte 0 (and
then either sign extends or zeros out upper bits).
*/
low_bit = (sel * VegaISA::BITS_PER_BYTE);
high_bit = low_bit + VegaISA::MSB_PER_BYTE;
retVal = bits(currOperVal, high_bit, low_bit);
// make sure update propagated, since used next
panic_if(bits(retVal, VegaISA::MSB_PER_BYTE) !=
bits(origOperVal, high_bit),
"ERROR: SDWA byte update not propagated: retVal: %d, "
"orig: %d\n", bits(retVal, VegaISA::MSB_PER_BYTE),
bits(origOperVal, high_bit));
// sign extended value depends on upper-most bit of the new byte 0
signExt_local = (signExt &&
(bits(retVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
// process all other bytes -- if sign extending, make them 1, else
// all 0's so leave as is
if (signExt_local) {
retVal = (uint32_t)sext<VegaISA::MSB_PER_BYTE>(retVal);
}
} else if (sel < SDWA_DWORD) { // we are selecting 1 word
/*
Process word 0 first. This code either selects the original bits
of word 0, or makes the bits of the selected word be word 0 (and
then either sign extends or zeros out upper bits).
*/
low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
high_bit = low_bit + VegaISA::MSB_PER_WORD;
retVal = bits(currOperVal, high_bit, low_bit);
// make sure update propagated, since used next
panic_if(bits(retVal, VegaISA::MSB_PER_WORD) !=
bits(origOperVal, high_bit),
"ERROR: SDWA word update not propagated: retVal: %d, "
"orig: %d\n",
bits(retVal, VegaISA::MSB_PER_WORD),
bits(origOperVal, high_bit));
// sign extended value depends on upper-most bit of the new word 0
signExt_local = (signExt &&
(bits(retVal, VegaISA::MSB_PER_WORD, 0) &
0x8000));
// process other word -- if sign extending, make them 1, else all
// 0's so leave as is
if (signExt_local) {
retVal = (uint32_t)sext<VegaISA::MSB_PER_WORD>(retVal);
}
} else {
assert(sel != SDWA_DWORD); // should have returned earlier
panic("Unimplemented SDWA select operation: %d\n", sel);
}
return retVal;
}
/**
* sdwaInstSrcImpl is a helper function that selects the appropriate
* bits/bytes for each lane of the inputted source operand of an SDWA
* instruction, does the appropriate masking/padding/sign extending for the
* non-selected bits/bytes, and updates the operands values with the
* resultant value.
*
* The desired behavior is:
* 1. Select the appropriate bits/bytes based on sel:
* 0 (SDWA_BYTE_0): select data[7:0]
* 1 (SDWA_BYTE_1): select data[15:8]
* 2 (SDWA_BYTE_2): select data[23:16]
* 3 (SDWA_BYTE_3): select data[31:24]
* 4 (SDWA_WORD_0): select data[15:0]
* 5 (SDWA_WORD_1): select data[31:16]
* 6 (SDWA_DWORD): select data[31:0]
* 2. if sign extend is set, then sign extend the value
*/
template<typename T>
void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
const SDWASelVals sel, const bool signExt)
{
// iterate over all lanes, setting appropriate, selected value
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
origCurrOper[lane], sel,
signExt);
}
}
/**
* sdwaInstDstImpl_helper contains the per-lane code for selecting the
* appropriate bytes/words of the lane and doing the appropriate
* masking/padding/sign extending. It returns the value after these
* operations are done on it.
*/
template<typename T>
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
const bool clamp, const SDWASelVals sel,
const SDWADstVals unusedBits_format)
{
// local variables
int low_bit = 0, high_bit = 0;
bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
//bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
// if we're preserving all of the bits, then we can immediately return
if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
assert(sel == SDWA_DWORD);
return currDstVal;
} else if (sel == SDWA_DWORD) {
// NOTE: users may set the unused bits variable to anything in this
// scenario, because it will be ignored
return currDstVal;
}
if (sel < SDWA_WORD_0) { // we are selecting 1 byte
// whether we sign extend depends on the upper-most bit of byte 0
signExt = (signExt &&
(bits(currDstVal, VegaISA::MSB_PER_WORD, 0) & 0x80));
for (int byte = 0; byte < 4; ++byte) {
low_bit = byte * VegaISA::BITS_PER_BYTE;
high_bit = low_bit + VegaISA::MSB_PER_BYTE;
/*
Options:
1. byte == sel: we are keeping all bits in this byte
2. preserve is set: keep this byte as is because the
output preserve flag is set
3. byte > sel && signExt: we're sign extending and
this byte is one of the bytes we need to sign extend
*/
origBits_thisByte = bits(origDstVal, high_bit, low_bit);
currBits_thisByte = bits(currDstVal, high_bit, low_bit);
newBits = ((byte == sel) ? origBits_thisByte :
((preserve) ? currBits_thisByte :
(((byte > sel) && signExt) ? 0xff : 0)));
retVal = insertBits(retVal, high_bit, low_bit, newBits);
}
} else if (sel < SDWA_DWORD) { // we are selecting 1 word
low_bit = 0;
high_bit = low_bit + VegaISA::MSB_PER_WORD;
// whether we sign extend depends on the upper-most bit of word 0
signExt = (signExt &&
(bits(currDstVal, high_bit, low_bit) & 0x8000));
for (int word = 0; word < 2; ++word) {
low_bit = word * VegaISA::BITS_PER_WORD;
high_bit = low_bit + VegaISA::MSB_PER_WORD;
/*
Options:
1. word == sel & 1: we are keeping all bits in this word
2. preserve is set: keep this word as is because the
output preserve flag is set
3. word > (sel & 1) && signExt: we're sign extending and
this word is one of the words we need to sign extend
*/
origBits_thisWord = bits(origDstVal, high_bit, low_bit);
currBits_thisWord = bits(currDstVal, high_bit, low_bit);
newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
((preserve) ? currBits_thisWord :
(((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
retVal = insertBits(retVal, high_bit, low_bit, newBits);
}
} else {
assert(sel != SDWA_DWORD); // should have returned earlier
panic("Unimplemented SDWA select operation: %d\n", sel);
}
return retVal;
}
/**
* sdwaInstDstImpl is a helper function that selects the appropriate
* bits/bytes for the inputted dest operand of an SDWA instruction, does
* the appropriate masking/padding/sign extending for the non-selected
* bits/bytes, and updates the operands values with the resultant value.
*
* The desired behavior is:
* 1. Select the appropriate bits/bytes based on sel:
* 0 (SDWA_BYTE_0): select data[7:0]
* 1 (SDWA_BYTE_1): select data[15:8]
* 2 (SDWA_BYTE_2): select data[23:16]
* 3 (SDWA_BYTE_3): select data[31:24]
* 4 (SDWA_WORD_0): select data[15:0]
* 5 (SDWA_WORD_1): select data[31:16]
* 6 (SDWA_DWORD): select data[31:0]
* 2. either pad, sign extend, or select all bits based on the value of
* unusedBits_format:
* 0 (SDWA_UNUSED_PAD): pad all unused bits with 0
* 1 (SDWA_UNUSED_SEXT): sign-extend upper bits; pad lower bits w/ 0
* 2 (SDWA_UNUSED_PRESERVE): select data[31:0]
*/
template<typename T>
void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
const SDWASelVals sel,
const SDWADstVals unusedBits_format)
{
// iterate over all lanes, setting appropriate, selected value
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
origDstOper[lane], clamp,
sel, unusedBits_format);
}
}
/**
* processSDWA_src_helper is a helper function for implementing sub d-word
* addressing instructions for the src operands. This function may be
* called by many different VOP1/VOP2/VOPC instructions to do operations
* within a register. This function is also agnostic of which operand it
* is operating on, so that it can be called for any src operand.
*/
template<typename T>
void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
const SDWASelVals src_sel,
const bool src_signExt, const bool src_abs,
const bool src_neg)
{
/**
* STEP 1: check if the absolute value (ABS) or negation (NEG) tags
* are set. If so, do the appropriate action(s) on the src operand.
*
* NOTE: According to the CSim implementation, ABS takes priority over
* NEG.
*/
if (src_neg) {
currSrc.negModifier();
}
if (src_abs) {
currSrc.absModifier();
}
/**
* STEP 2: select the appropriate bits for each lane of source operand.
*/
sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
}
/**
* processSDWA_src is a helper function for implementing sub d-word
* addressing instructions for the src operands. This function may be
* called by many different VOP1 instructions to do operations within a
* register. processSDWA_dst is called after the math, while
* processSDWA_src is called before the math.
*/
template<typename T>
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
{
// local variables
const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
const bool src0_signExt = sdwaInst.SRC0_SEXT;
const bool src0_neg = sdwaInst.SRC0_NEG;
const bool src0_abs = sdwaInst.SRC0_ABS;
// NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
// operand. So ensure that SRC1 fields are not set, then call helper
// function only on src0.
assert(!sdwaInst.SRC1_SEXT);
assert(!sdwaInst.SRC1_NEG);
assert(!sdwaInst.SRC1_ABS);
processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
src0_abs, src0_neg);
}
/**
* processSDWA_src is a helper function for implementing sub d-word
* addressing instructions. This function may be called by many different
* VOP2/VOPC instructions to do operations within a register.
* processSDWA_dst is called after the math, while processSDWA_src is
* called before the math.
*/
template<typename T>
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
T & src1, T & origSrc1)
{
// local variables
const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
const bool src0_signExt = sdwaInst.SRC0_SEXT;
const bool src0_neg = sdwaInst.SRC0_NEG;
const bool src0_abs = sdwaInst.SRC0_ABS;
const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
const bool src1_signExt = sdwaInst.SRC1_SEXT;
const bool src1_neg = sdwaInst.SRC1_NEG;
const bool src1_abs = sdwaInst.SRC1_ABS;
processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
src0_abs, src0_neg);
processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
src1_abs, src1_neg);
}
/**
* processSDWA_dst is a helper function for implementing sub d-word
* addressing instructions for the dst operand. This function may be
* called by many different VOP1/VOP2/VOPC instructions to do operations
* within a register. processSDWA_dst is called after the math, while
* processSDWA_src is called before the math.
*/
template<typename T>
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
{
// local variables
const SDWADstVals dst_unusedBits_format =
(SDWADstVals)sdwaInst.DST_UNUSED;
const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
const bool clamp = sdwaInst.CLAMP;
/**
* STEP 1: select the appropriate bits for dst and pad/sign-extend as
* appropriate.
*/
sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
}
} // namespace VegaISA
#endif // __ARCH_VEGA_INSTS_INST_UTIL_HH__
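
Of the DPP_CTRL cases in dppInstImpl, the full quad permute is the easiest to sanity-check by hand: each destination lane's 2-bit selector in dppCtrl picks a source lane within its four-lane quad. Below is a standalone copy of just that branch, exercised with assumed control words:

    #include <cassert>

    // Standalone copy of the DPP_QUAD_PERM branch of dppInstImpl(): the
    // four 2-bit fields of dppCtrl select a source lane within each quad.
    int quadPermLane(int dppCtrl, int currLane)
    {
        int quadBase = (currLane & ~(3));           // first lane of this quad
        int quadPix = (currLane & 3);               // position within the quad
        quadPix = ((dppCtrl >> (2 * quadPix)) & 3); // 2-bit source selector
        return (quadBase | quadPix);
    }

    int main()
    {
        // dppCtrl 0x1B = 0b00011011 reverses each quad: lanes read (3,2,1,0).
        assert(quadPermLane(0x1B, 0) == 3);
        assert(quadPermLane(0x1B, 1) == 2);
        assert(quadPermLane(0x1B, 5) == 6); // second quad: lane 5 reads lane 6
        // dppCtrl 0x00 broadcasts lane 0 of each quad to all four lanes.
        assert(quadPermLane(0x00, 6) == 4);
        return 0;
    }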

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,834 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#include "arch/amdgpu/vega/gpu_decoder.hh"
#include "arch/amdgpu/vega/gpu_mem_helpers.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
#include "arch/amdgpu/vega/operand.hh"
#include "debug/GPUExec.hh"
#include "debug/VEGA.hh"
#include "mem/ruby/system/RubySystem.hh"
namespace VegaISA
{
struct BufferRsrcDescriptor
{
uint64_t baseAddr : 48;
uint32_t stride : 14;
uint32_t cacheSwizzle : 1;
uint32_t swizzleEn : 1;
uint32_t numRecords : 32;
uint32_t dstSelX : 3;
uint32_t dstSelY : 3;
uint32_t dstSelZ : 3;
uint32_t dstSelW : 3;
uint32_t numFmt : 3;
uint32_t dataFmt : 4;
uint32_t elemSize : 2;
uint32_t idxStride : 2;
uint32_t addTidEn : 1;
uint32_t atc : 1;
uint32_t hashEn : 1;
uint32_t heap : 1;
uint32_t mType : 3;
uint32_t type : 2;
};
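/**
 * Usage sketch (illustrative): a buffer resource descriptor (V#) arrives
 * as four consecutive SGPRs read through a ConstScalarOperandU128;
 * memcpy'ing its raw data into the packed struct above recovers the
 * individual fields:
 *
 *     BufferRsrcDescriptor rsrc_desc;
 *     std::memcpy(&rsrc_desc, s_rsrc_desc.rawDataPtr(),
 *                 sizeof(BufferRsrcDescriptor));
 *     Addr base = rsrc_desc.baseAddr;   // 48b base address
 *
 * as done by the SMEM and MUBUF calcAddr() helpers below.
 */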
// --- purely virtual instruction classes ---
class Inst_SOP2 : public VEGAGPUStaticInst
{
public:
Inst_SOP2(InFmt_SOP2*, const std::string &opcode);
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_SOP2 instData;
// possible second DWORD
InstFormat extData;
uint32_t varSize;
private:
bool hasSecondDword(InFmt_SOP2 *);
}; // Inst_SOP2
class Inst_SOPK : public VEGAGPUStaticInst
{
public:
Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
~Inst_SOPK();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_SOPK instData;
// possible second DWORD
InstFormat extData;
uint32_t varSize;
private:
bool hasSecondDword(InFmt_SOPK *);
}; // Inst_SOPK
class Inst_SOP1 : public VEGAGPUStaticInst
{
public:
Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
~Inst_SOP1();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_SOP1 instData;
// possible second DWORD
InstFormat extData;
uint32_t varSize;
private:
bool hasSecondDword(InFmt_SOP1 *);
}; // Inst_SOP1
class Inst_SOPC : public VEGAGPUStaticInst
{
public:
Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
~Inst_SOPC();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_SOPC instData;
// possible second DWORD
InstFormat extData;
uint32_t varSize;
private:
bool hasSecondDword(InFmt_SOPC *);
}; // Inst_SOPC
class Inst_SOPP : public VEGAGPUStaticInst
{
public:
Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
~Inst_SOPP();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_SOPP instData;
}; // Inst_SOPP
class Inst_SMEM : public VEGAGPUStaticInst
{
public:
Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
~Inst_SMEM();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
/**
* initiate a memory read access for N dwords
*/
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
MemCmd::ReadReq);
}
/**
* initiate a memory write access for N dwords
*/
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
MemCmd::WriteReq);
}
/**
* For normal s_load_dword/s_store_dword instruction addresses.
*/
void
calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
ScalarRegU32 offset)
{
Addr vaddr = ((addr.rawData() + offset) & ~0x3);
gpu_dyn_inst->scalarAddr = vaddr;
}
/**
* For s_buffer_load_dword/s_buffer_store_dword instruction addresses.
* The s_buffer instructions use the same buffer resource descriptor
* as the MUBUF instructions.
*/
void
calcAddr(GPUDynInstPtr gpu_dyn_inst,
ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
{
BufferRsrcDescriptor rsrc_desc;
ScalarRegU32 clamped_offset(offset);
std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
sizeof(BufferRsrcDescriptor));
/**
* The address is clamped if:
* Stride is zero: clamp if offset >= num_records
* Stride is non-zero: clamp if offset > (stride * num_records)
*/
if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
clamped_offset = rsrc_desc.numRecords;
} else if (rsrc_desc.stride && offset
> (rsrc_desc.stride * rsrc_desc.numRecords)) {
clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
}
Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
gpu_dyn_inst->scalarAddr = vaddr;
}
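/**
 * Worked example (illustrative values): with stride == 0 and
 * numRecords == 64, an offset of 100 is clamped to 64, so the access
 * resolves to (baseAddr + 64) & ~0x3. The final & ~0x3 forces dword
 * alignment in both the clamped and unclamped cases.
 */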
// first instruction DWORD
InFmt_SMEM instData;
// second instruction DWORD
InFmt_SMEM_1 extData;
}; // Inst_SMEM
class Inst_VOP2 : public VEGAGPUStaticInst
{
public:
Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
~Inst_VOP2();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_VOP2 instData;
// possible second DWORD
InstFormat extData;
uint32_t varSize;
private:
bool hasSecondDword(InFmt_VOP2 *);
}; // Inst_VOP2
class Inst_VOP1 : public VEGAGPUStaticInst
{
public:
Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
~Inst_VOP1();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_VOP1 instData;
// possible second DWORD
InstFormat extData;
uint32_t varSize;
private:
bool hasSecondDword(InFmt_VOP1 *);
}; // Inst_VOP1
class Inst_VOPC : public VEGAGPUStaticInst
{
public:
Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
~Inst_VOPC();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_VOPC instData;
// possible second DWORD
InstFormat extData;
uint32_t varSize;
private:
bool hasSecondDword(InFmt_VOPC *);
}; // Inst_VOPC
class Inst_VINTRP : public VEGAGPUStaticInst
{
public:
Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
~Inst_VINTRP();
int instSize() const override;
protected:
// first instruction DWORD
InFmt_VINTRP instData;
}; // Inst_VINTRP
class Inst_VOP3 : public VEGAGPUStaticInst
{
public:
Inst_VOP3(InFmt_VOP3*, const std::string &opcode, bool sgpr_dst);
~Inst_VOP3();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_VOP3 instData;
// second instruction DWORD
InFmt_VOP3_1 extData;
private:
bool hasSecondDword(InFmt_VOP3 *);
/**
* the v_cmp and readlane instructions in the VOP3
* encoding are unique because they are the only
* instructions that use the VDST field to specify
* a scalar register destination. for VOP3::V_CMP insts
* VDST specifies the arbitrary SGPR pair used to write
* VCC. for V_READLANE VDST specifies the SGPR to return
* the value of the selected lane in the source VGPR
* from which we are reading.
*/
const bool sgprDst;
}; // Inst_VOP3
class Inst_VOP3_SDST_ENC : public VEGAGPUStaticInst
{
public:
Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC*, const std::string &opcode);
~Inst_VOP3_SDST_ENC();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
// first instruction DWORD
InFmt_VOP3_SDST_ENC instData;
// second instruction DWORD
InFmt_VOP3_1 extData;
private:
bool hasSecondDword(InFmt_VOP3_SDST_ENC *);
}; // Inst_VOP3_SDST_ENC
class Inst_DS : public VEGAGPUStaticInst
{
public:
Inst_DS(InFmt_DS*, const std::string &opcode);
~Inst_DS();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
template<typename T>
void
initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
{
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane] + offset;
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
= wf->ldsChunk->read<T>(vaddr);
}
}
}
template<typename T>
void
initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
{
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
= wf->ldsChunk->read<T>(vaddr0);
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
= wf->ldsChunk->read<T>(vaddr1);
}
}
}
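// Data layout note (illustrative): for a two-address read such as
// ds_read2_b32 with byte offsets offset0 = 0 and offset1 = 4, lane L
// receives
//     d_data[2L]     = LDS[addr[L] + 0]
//     d_data[2L + 1] = LDS[addr[L] + 4]
// i.e., the two dwords are interleaved per lane rather than stored as
// two contiguous vectors.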
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
{
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane] + offset;
wf->ldsChunk->write<T>(vaddr,
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
}
}
}
template<typename T>
void
initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
{
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
gpuDynInst->d_data))[lane * 2]);
wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
gpuDynInst->d_data))[lane * 2 + 1]);
}
}
}
void
calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
{
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
gpuDynInst->addr.at(lane) = (Addr)addr[lane];
}
}
}
// first instruction DWORD
InFmt_DS instData;
// second instruction DWORD
InFmt_DS_1 extData;
}; // Inst_DS
class Inst_MUBUF : public VEGAGPUStaticInst
{
public:
Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
~Inst_MUBUF();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
template<typename T>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
// temporarily modify exec_mask to suppress memory accesses to oob
// regions. Only issue memory requests for lanes that have their
// exec_mask set and are not out of bounds.
VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
gpuDynInst->exec_mask = old_exec_mask;
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
// temporarily modify exec_mask to suppress memory accesses to oob
// regions. Only issue memory requests for lanes that have their
// exec_mask set and are not out of bounds.
VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
gpuDynInst->exec_mask = old_exec_mask;
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
// temporarily modify exec_mask to suppress memory accesses to oob
// regions. Only issue memory requests for lanes that have their
// exec_mask set and are not out of bounds.
VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
gpuDynInst->exec_mask = old_exec_mask;
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
// temporarily modify exec_mask to suppress memory accesses to oob
// regions. Only issue memory requests for lanes that have their
// exec_mask set and are not out of bounds.
VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
gpuDynInst->exec_mask = old_exec_mask;
}
void
injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
{
// create request and set flags
gpuDynInst->resetEntireStatusVector();
gpuDynInst->setStatusVector(0, 1);
auto req = std::make_shared<Request>(0, 0, 0, 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
gpuDynInst->setRequestFlags(req);
gpuDynInst->computeUnit()->
injectGlobalMemFence(gpuDynInst, false, req);
}
/**
* MUBUF instructions calculate their addresses as follows:
*
* index = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
* offset = (OFFEN ? vgpr_off : 0) + inst_off
*
* / ====================== LINEAR ADDRESSING ====================== /
* VADDR = base + sgpr_off + offset + stride * index
*
* / ===================== SWIZZLED ADDRESSING ===================== /
* index_msb = index / const_index_stride
* index_lsb = index % const_index_stride
* offset_msb = offset / const_element_size
* offset_lsb = offset % const_element_size
* buffer_offset = ((index_msb * stride + offset_msb *
* const_element_size) * const_index_stride +
* index_lsb * const_element_size + offset_lsb)
*
* VADDR = base + sgpr_off + buffer_offset
*/
template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
void
calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
{
Addr vaddr = 0;
Addr base_addr = 0;
Addr stride = 0;
Addr buf_idx = 0;
Addr buf_off = 0;
BufferRsrcDescriptor rsrc_desc;
std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
sizeof(BufferRsrcDescriptor));
base_addr = rsrc_desc.baseAddr;
stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
+ rsrc_desc.stride) : rsrc_desc.stride;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
vaddr = base_addr + s_offset.rawData();
/**
* first we calculate the buffer's index and offset.
* these will be used for either linear or swizzled
* buffers.
*/
buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);
buf_off = v_off[lane] + inst_offset;
/**
* Range check behavior causes out-of-range accesses
* to be treated differently. Out-of-range accesses return
* 0 for loads and are ignored for stores. For
* non-formatted accesses, this is done on a per-lane
* basis.
*/
if (stride == 0 || !rsrc_desc.swizzleEn) {
if (buf_off + stride * buf_idx >=
rsrc_desc.numRecords - s_offset.rawData()) {
DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
"lane = %d, buffer_offset = %llx, "
"const_stride = %llx, "
"const_num_records = %llx\n",
lane, buf_off + stride * buf_idx,
stride, rsrc_desc.numRecords);
oobMask.set(lane);
continue;
}
}
if (stride != 0 && rsrc_desc.swizzleEn) {
if (buf_idx >= rsrc_desc.numRecords ||
buf_off >= stride) {
DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
"lane = %d, offset = %llx, "
"index = %llx, "
"const_num_records = %llx\n",
lane, buf_off, buf_idx,
rsrc_desc.numRecords);
oobMask.set(lane);
continue;
}
}
if (rsrc_desc.swizzleEn) {
Addr idx_stride = 8 << rsrc_desc.idxStride;
Addr elem_size = 2 << rsrc_desc.elemSize;
Addr idx_msb = buf_idx / idx_stride;
Addr idx_lsb = buf_idx % idx_stride;
Addr off_msb = buf_off / elem_size;
Addr off_lsb = buf_off % elem_size;
DPRINTF(VEGA, "mubuf swizzled lane %d: "
"idx_stride = %llx, elem_size = %llx, "
"idx_msb = %llx, idx_lsb = %llx, "
"off_msb = %llx, off_lsb = %llx\n",
lane, idx_stride, elem_size, idx_msb, idx_lsb,
off_msb, off_lsb);
vaddr += ((idx_msb * stride + off_msb * elem_size)
* idx_stride + idx_lsb * elem_size + off_lsb);
} else {
vaddr += buf_off + stride * buf_idx;
}
DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
"vaddr = %llx, base_addr = %llx, "
"stride = %llx, buf_idx = %llx, buf_off = %llx\n",
lane, vaddr, base_addr, stride,
buf_idx, buf_off);
gpuDynInst->addr.at(lane) = vaddr;
}
}
}
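/**
 * Worked example of the swizzled path (illustrative numbers only):
 * idxStride = 0 and elemSize = 1 give idx_stride = 8 and elem_size = 4.
 * With stride = 16, buf_idx = 10, and buf_off = 6:
 *
 *     idx_msb = 10 / 8 = 1,  idx_lsb = 10 % 8 = 2
 *     off_msb =  6 / 4 = 1,  off_lsb =  6 % 4 = 2
 *     buffer_offset = (1 * 16 + 1 * 4) * 8 + 2 * 4 + 2 = 170
 *
 * so the lane's vaddr is base_addr + s_offset + 170.
 */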
// first instruction DWORD
InFmt_MUBUF instData;
// second instruction DWORD
InFmt_MUBUF_1 extData;
// Mask of lanes with out-of-bounds accesses. Needs to be tracked
// separately from the exec_mask so that we remember to write zero
// to the registers associated with out of bounds lanes.
VectorMask oobMask;
}; // Inst_MUBUF
class Inst_MTBUF : public VEGAGPUStaticInst
{
public:
Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
~Inst_MTBUF();
int instSize() const override;
protected:
// first instruction DWORD
InFmt_MTBUF instData;
// second instruction DWORD
InFmt_MTBUF_1 extData;
private:
bool hasSecondDword(InFmt_MTBUF *);
}; // Inst_MTBUF
class Inst_MIMG : public VEGAGPUStaticInst
{
public:
Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
~Inst_MIMG();
int instSize() const override;
protected:
// first instruction DWORD
InFmt_MIMG instData;
// second instruction DWORD
InFmt_MIMG_1 extData;
}; // Inst_MIMG
class Inst_EXP : public VEGAGPUStaticInst
{
public:
Inst_EXP(InFmt_EXP*, const std::string &opcode);
~Inst_EXP();
int instSize() const override;
protected:
// first instruction DWORD
InFmt_EXP instData;
// second instruction DWORD
InFmt_EXP_1 extData;
}; // Inst_EXP
class Inst_FLAT : public VEGAGPUStaticInst
{
public:
Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
~Inst_FLAT();
int instSize() const override;
void generateDisassembly() override;
bool isScalarRegister(int opIdx) override;
bool isVectorRegister(int opIdx) override;
int getRegisterIndex(int opIdx, int num_scalar_regs) override;
protected:
template<typename T>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
}
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
}
void
calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr)
{
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = addr[lane];
}
}
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
}
// first instruction DWORD
InFmt_FLAT instData;
// second instruction DWORD
InFmt_FLAT_1 extData;
}; // Inst_FLAT
} // namespace VegaISA
#endif // __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__

src/arch/amdgpu/vega/isa.cc

@@ -0,0 +1,101 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/gpu_isa.hh"
#include <numeric>
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/wavefront.hh"
namespace VegaISA
{
GPUISA::GPUISA(Wavefront &wf) : wavefront(wf), m0(0)
{
}
ScalarRegU32
GPUISA::readMiscReg(int opIdx) const
{
switch (opIdx) {
case REG_M0:
return m0;
case REG_ZERO:
return 0;
case REG_SCC:
return statusReg.SCC;
default:
fatal("attempting to read from unsupported or non-readable "
"register. selector val: %i\n", opIdx);
return 0;
}
}
void
GPUISA::writeMiscReg(int opIdx, ScalarRegU32 operandVal)
{
switch (opIdx) {
case REG_M0:
m0 = operandVal;
break;
case REG_SCC:
statusReg.SCC = operandVal ? 1 : 0;
break;
default:
fatal("attempting to write to an unsupported or non-writable "
"register. selector val: %i\n", opIdx);
break;
}
}
void
GPUISA::advancePC(GPUDynInstPtr gpuDynInst)
{
wavefront.pc(wavefront.pc()
+ gpuDynInst->staticInstruction()->instSize());
}
const std::array<const ScalarRegU32, NumPosConstRegs>
GPUISA::posConstRegs = { {
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
} };
const std::array<const ScalarRegI32, NumNegConstRegs>
GPUISA::negConstRegs = { {
-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
-16
} };
} // namespace VegaISA
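/**
 * Illustrative only (not part of this file): how the misc-register
 * interface above is exercised. A scalar operand whose op selector is
 * not an SGPR funnels writes through writeMiscReg() and reads through
 * readMiscReg(); `isa` is a hypothetical GPUISA instance.
 *
 *     isa.writeMiscReg(REG_M0, 0x1000);        // e.g., s_mov_b32 m0, ...
 *     assert(isa.readMiscReg(REG_M0) == 0x1000);
 *     assert(isa.readMiscReg(REG_ZERO) == 0);  // always reads as zero
 */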


@@ -0,0 +1,740 @@
/*
* Copyright (c) 2017-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_OPERAND_HH__
#define __ARCH_VEGA_OPERAND_HH__
#include <array>
#include "arch/amdgpu/vega/gpu_registers.hh"
#include "arch/generic/vec_reg.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
/**
* classes that represent vector/scalar operands in the VEGA ISA. these classes
* wrap the generic vector register type (i.e., src/arch/generic/vec_reg.hh)
* and allow them to be manipulated in ways that are unique to VEGA insts.
*/
namespace VegaISA
{
/**
* convenience traits so we can automatically infer the correct FP type
* without looking at the number of dwords (i.e., to determine if we
* need a float or a double when creating FP constants).
*/
template<typename T> struct OpTraits { typedef float FloatT; };
template<> struct OpTraits<ScalarRegF64> { typedef double FloatT; };
template<> struct OpTraits<ScalarRegU64> { typedef double FloatT; };
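/**
 * Sketch of the intended use (illustrative): a template shared by the
 * 32b and 64b paths can materialize an FP constant at the right width
 * without inspecting dword counts:
 *
 *     template<typename T>
 *     typename OpTraits<T>::FloatT half() { return 0.5; }
 *
 * half<ScalarRegF32>() yields a float, half<ScalarRegF64>() a double.
 */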
class Operand
{
public:
Operand() = delete;
Operand(GPUDynInstPtr gpuDynInst, int opIdx)
: _gpuDynInst(gpuDynInst), _opIdx(opIdx)
{
assert(_gpuDynInst);
assert(_opIdx >= 0);
}
/**
* read from and write to the underlying register(s) that
* this operand is referring to.
*/
virtual void read() = 0;
virtual void write() = 0;
protected:
/**
* instruction object that owns this operand
*/
GPUDynInstPtr _gpuDynInst;
/**
* op selector value for this operand. note that this is not
* the same as the register file index, be it scalar or vector.
* this could refer to inline constants, system regs, or even
* special values.
*/
int _opIdx;
};
template<typename DataType, bool Const, size_t NumDwords>
class ScalarOperand;
template<typename DataType, bool Const,
size_t NumDwords = sizeof(DataType) / sizeof(VecElemU32)>
class VecOperand final : public Operand
{
static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
"Incorrect number of DWORDS for VEGA operand.");
public:
VecOperand() = delete;
VecOperand(GPUDynInstPtr gpuDynInst, int opIdx)
: Operand(gpuDynInst, opIdx), scalar(false), absMod(false),
negMod(false), scRegData(gpuDynInst, _opIdx),
vrfData{{ nullptr }}
{
vecReg.zero();
}
~VecOperand()
{
}
/**
* certain vector operands can read from the vrf/srf or constants.
* we use this method to first determine the type of the operand,
* then we read from the appropriate source. if vector we read
* directly from the vrf. if scalar, we read in the data through
* the scalar operand component. this should only be used for VSRC
* operands.
*/
void
readSrc()
{
if (isVectorReg(_opIdx)) {
_opIdx = opSelectorToRegIdx(_opIdx, _gpuDynInst->wavefront()
->reservedScalarRegs);
read();
} else {
readScalar();
}
}
/**
* read from the vrf. this should only be used by vector inst
* source operands that are explicitly vector (i.e., VSRC).
*/
void
read() override
{
assert(_gpuDynInst);
assert(_gpuDynInst->wavefront());
assert(_gpuDynInst->computeUnit());
Wavefront *wf = _gpuDynInst->wavefront();
ComputeUnit *cu = _gpuDynInst->computeUnit();
for (auto i = 0; i < NumDwords; ++i) {
int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx + i);
vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);
cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
}
if (NumDwords == 1) {
assert(vrfData[0]);
auto vgpr = vecReg.template as<DataType>();
auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
std::memcpy((void*)&vgpr[lane],
(void*)&reg_file_vgpr[lane], sizeof(DataType));
}
} else if (NumDwords == 2) {
assert(vrfData[0]);
assert(vrfData[1]);
auto vgpr = vecReg.template as<VecElemU64>();
auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
VecElemU64 tmp_val(0);
((VecElemU32*)&tmp_val)[0] = reg_file_vgpr0[lane];
((VecElemU32*)&tmp_val)[1] = reg_file_vgpr1[lane];
vgpr[lane] = tmp_val;
}
}
}
/**
* write to the vrf. we maintain a copy of the underlying vector
* reg(s) for this operand (i.e., vrfData/scRegData), as well as a
* temporary vector register representation (i.e., vecReg),
* which allows the execute() methods of instructions
* to easily write their operand data using operator[] regardless of
* their size. after the result is calculated we use write() to write
* the data to the actual register file storage. this allows us to do
* type conversion, etc., in a single call as opposed to doing it
* in each execute() method.
*/
void
write() override
{
assert(_gpuDynInst);
assert(_gpuDynInst->wavefront());
assert(_gpuDynInst->computeUnit());
Wavefront *wf = _gpuDynInst->wavefront();
ComputeUnit *cu = _gpuDynInst->computeUnit();
VectorMask &exec_mask = _gpuDynInst->isLoad()
? _gpuDynInst->exec_mask : wf->execMask();
if (NumDwords == 1) {
int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
assert(vrfData[0]);
auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
auto vgpr = vecReg.template as<DataType>();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
std::memcpy((void*)&reg_file_vgpr[lane],
(void*)&vgpr[lane], sizeof(DataType));
}
}
DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
} else if (NumDwords == 2) {
int vgprIdx0 = cu->registerManager.mapVgpr(wf, _opIdx);
int vgprIdx1 = cu->registerManager.mapVgpr(wf, _opIdx + 1);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
assert(vrfData[0]);
assert(vrfData[1]);
auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();
auto vgpr = vecReg.template as<VecElemU64>();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
reg_file_vgpr0[lane] = ((VecElemU32*)&vgpr[lane])[0];
reg_file_vgpr1[lane] = ((VecElemU32*)&vgpr[lane])[1];
}
}
DPRINTF(GPUVRF, "Write v[%d:%d]\n", vgprIdx0, vgprIdx1);
cu->vrf[wf->simdId]->printReg(wf, vgprIdx0);
cu->vrf[wf->simdId]->printReg(wf, vgprIdx1);
}
}
void
negModifier()
{
negMod = true;
}
void
absModifier()
{
absMod = true;
}
/**
* getter [] operator. only enable if this operand is constant
* (i.e., a source operand) and if it can be represented using
* primitive types (i.e., 8b to 64b primitives).
*/
template<bool Condition = (NumDwords == 1 || NumDwords == 2) && Const>
typename std::enable_if<Condition, const DataType>::type
operator[](size_t idx) const
{
assert(idx < NumVecElemPerVecReg);
if (scalar) {
DataType ret_val = scRegData.rawData();
if (absMod) {
assert(std::is_floating_point<DataType>::value);
ret_val = std::fabs(ret_val);
}
if (negMod) {
assert(std::is_floating_point<DataType>::value);
ret_val = -ret_val;
}
return ret_val;
} else {
auto vgpr = vecReg.template as<DataType>();
DataType ret_val = vgpr[idx];
if (absMod) {
assert(std::is_floating_point<DataType>::value);
ret_val = std::fabs(ret_val);
}
if (negMod) {
assert(std::is_floating_point<DataType>::value);
ret_val = -ret_val;
}
return ret_val;
}
}
/**
* setter [] operator. only enable if this operand is non-constant
* (i.e., a destination operand) and if it can be represented using
* primitive types (i.e., 8b to 64b primitives).
*/
template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
typename std::enable_if<Condition, DataType&>::type
operator[](size_t idx)
{
assert(!scalar);
assert(idx < NumVecElemPerVecReg);
return vecReg.template as<DataType>()[idx];
}
private:
/**
* if we determine that this operand is a scalar (reg or constant)
* then we read the scalar data into the scalar operand data member.
*/
void
readScalar()
{
scalar = true;
scRegData.read();
}
using VecRegCont = typename std::conditional<NumDwords == 2,
VecRegContainerU64, typename std::conditional<sizeof(DataType)
== sizeof(VecElemU16), VecRegContainerU16,
typename std::conditional<sizeof(DataType)
== sizeof(VecElemU8), VecRegContainerU8,
VecRegContainerU32>::type>::type>::type;
/**
* whether this operand is a scalar or not.
*/
bool scalar;
/**
* absolute value and negative modifiers. VOP3 instructions
* may indicate that their input/output operands must be
* modified, either by taking the absolute value or negating
* them. these bools indicate which modifier, if any, to use.
*/
bool absMod;
bool negMod;
/**
* this holds all the operand data in a single vector register
* object (i.e., if an operand is 64b, this will hold the data
* from both registers the operand is using).
*/
VecRegCont vecReg;
/**
* for src operands that read scalars (i.e., scalar regs or
* a scalar constant).
*/
ScalarOperand<DataType, Const, NumDwords> scRegData;
/**
* pointers to the underlying registers (i.e., the actual
* registers in the register file).
*/
std::array<VecRegContainerU32*, NumDwords> vrfData;
};
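/**
 * Usage sketch (illustrative; operand and field names are assumed): the
 * lifecycle a typical VOP2 execute() follows with these classes:
 *
 *     ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
 *     ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
 *     VecOperandU32 vdst(gpuDynInst, instData.VDST);
 *     src0.readSrc();    // resolves VGPR vs. scalar/constant source
 *     src1.read();       // VSRC1 is always a VGPR
 *     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane)
 *         if (wf->execMask(lane))
 *             vdst[lane] = src0[lane] + src1[lane];
 *     vdst.write();      // commits enabled lanes to the VRF
 */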
template<typename DataType, bool Const,
size_t NumDwords = sizeof(DataType) / sizeof(ScalarRegU32)>
class ScalarOperand final : public Operand
{
static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
"Incorrect number of DWORDS for VEGA operand.");
public:
ScalarOperand() = delete;
ScalarOperand(GPUDynInstPtr gpuDynInst, int opIdx)
: Operand(gpuDynInst, opIdx)
{
std::memset(srfData.data(), 0, NumDwords * sizeof(ScalarRegU32));
}
~ScalarOperand()
{
}
/**
* we store scalar data in a std::array; however, if we need the
* full operand data we use this method to copy all elements of
* the scalar operand data to a single primitive container. only
* useful for 8b to 64b primitive types, as they are the only types
* that we need to perform computation on.
*/
template<bool Condition = NumDwords == 1 || NumDwords == 2>
typename std::enable_if<Condition, DataType>::type
rawData() const
{
assert(sizeof(DataType) <= sizeof(srfData));
DataType raw_data((DataType)0);
std::memcpy((void*)&raw_data, (void*)srfData.data(),
sizeof(DataType));
return raw_data;
}
void*
rawDataPtr()
{
return (void*)srfData.data();
}
void
read() override
{
Wavefront *wf = _gpuDynInst->wavefront();
ComputeUnit *cu = _gpuDynInst->computeUnit();
if (!isScalarReg(_opIdx)) {
readSpecialVal();
} else {
for (auto i = 0; i < NumDwords; ++i) {
int sgprIdx = regIdx(i);
srfData[i] = cu->srf[wf->simdId]->read(sgprIdx);
DPRINTF(GPUSRF, "Read s[%d]\n", sgprIdx);
cu->srf[wf->simdId]->printReg(wf, sgprIdx);
}
}
}
void
write() override
{
Wavefront *wf = _gpuDynInst->wavefront();
ComputeUnit *cu = _gpuDynInst->computeUnit();
if (!isScalarReg(_opIdx)) {
if (_opIdx == REG_EXEC_LO) {
ScalarRegU64 new_exec_mask_val
= wf->execMask().to_ullong();
if (NumDwords == 1) {
std::memcpy((void*)&new_exec_mask_val,
(void*)srfData.data(), sizeof(VecElemU32));
} else if (NumDwords == 2) {
std::memcpy((void*)&new_exec_mask_val,
(void*)srfData.data(), sizeof(VecElemU64));
} else {
panic("Trying to write more than 2 DWORDS to EXEC\n");
}
VectorMask new_exec_mask(new_exec_mask_val);
wf->execMask() = new_exec_mask;
DPRINTF(GPUSRF, "Write EXEC\n");
DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
} else if (_opIdx == REG_EXEC_HI) {
/**
* If we're writing only the upper half of the EXEC mask
* this ought to be a single dword operand.
*/
assert(NumDwords == 1);
ScalarRegU32 new_exec_mask_hi_val(0);
ScalarRegU64 new_exec_mask_val
= wf->execMask().to_ullong();
std::memcpy((void*)&new_exec_mask_hi_val,
(void*)srfData.data(), sizeof(new_exec_mask_hi_val));
replaceBits(new_exec_mask_val, 63, 32,
new_exec_mask_hi_val);
VectorMask new_exec_mask(new_exec_mask_val);
wf->execMask() = new_exec_mask;
DPRINTF(GPUSRF, "Write EXEC\n");
DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
} else {
_gpuDynInst->writeMiscReg(_opIdx, srfData[0]);
}
} else {
for (auto i = 0; i < NumDwords; ++i) {
int sgprIdx = regIdx(i);
auto &sgpr = cu->srf[wf->simdId]->readWriteable(sgprIdx);
if (_gpuDynInst->isLoad()) {
assert(sizeof(DataType) <= sizeof(ScalarRegU64));
sgpr = reinterpret_cast<ScalarRegU32*>(
_gpuDynInst->scalar_data)[i];
} else {
sgpr = srfData[i];
}
DPRINTF(GPUSRF, "Write s[%d]\n", sgprIdx);
cu->srf[wf->simdId]->printReg(wf, sgprIdx);
}
}
}
/**
* bit access to scalar data. primarily used for setting vcc bits.
*/
template<bool Condition = NumDwords == 1 || NumDwords == 2>
typename std::enable_if<Condition, void>::type
setBit(int bit, int bit_val)
{
DataType &sgpr = *((DataType*)srfData.data());
replaceBits(sgpr, bit, bit_val);
}
template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
typename std::enable_if<Condition, ScalarOperand&>::type
operator=(DataType rhs)
{
std::memcpy((void*)srfData.data(), (void*)&rhs, sizeof(DataType));
return *this;
}
private:
/**
* we have determined that we are not reading our scalar operand data
* from the register file, so here we figure out which special value
* we are reading (i.e., float constant, int constant, inline
* constant, or various other system registers (e.g., exec mask)).
*/
void
readSpecialVal()
{
assert(NumDwords == 1 || NumDwords == 2);
switch(_opIdx) {
case REG_EXEC_LO:
{
ScalarRegU64 exec_mask = _gpuDynInst->wavefront()->
execMask().to_ullong();
std::memcpy((void*)srfData.data(), (void*)&exec_mask,
sizeof(srfData));
DPRINTF(GPUSRF, "Read EXEC\n");
DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
}
break;
case REG_EXEC_HI:
{
/**
* If we're reading only the upper half of the EXEC mask
* this ought to be a single dword operand.
*/
assert(NumDwords == 1);
ScalarRegU64 exec_mask = _gpuDynInst->wavefront()
->execMask().to_ullong();
ScalarRegU32 exec_mask_hi = bits(exec_mask, 63, 32);
std::memcpy((void*)srfData.data(), (void*)&exec_mask_hi,
sizeof(srfData));
DPRINTF(GPUSRF, "Read EXEC_HI\n");
DPRINTF(GPUSRF, "EXEC_HI = %#x\n", exec_mask_hi);
}
break;
case REG_SRC_SWDA:
case REG_SRC_DPP:
case REG_SRC_LITERAL:
assert(NumDwords == 1);
srfData[0] = _gpuDynInst->srcLiteral();
break;
case REG_POS_HALF:
{
typename OpTraits<DataType>::FloatT pos_half = 0.5;
std::memcpy((void*)srfData.data(), (void*)&pos_half,
sizeof(srfData));
}
break;
case REG_NEG_HALF:
{
typename OpTraits<DataType>::FloatT neg_half = -0.5;
std::memcpy((void*)srfData.data(), (void*)&neg_half,
sizeof(srfData));
}
break;
case REG_POS_ONE:
{
typename OpTraits<DataType>::FloatT pos_one = 1.0;
std::memcpy(srfData.data(), &pos_one, sizeof(srfData));
}
break;
case REG_NEG_ONE:
{
typename OpTraits<DataType>::FloatT neg_one = -1.0;
std::memcpy(srfData.data(), &neg_one, sizeof(srfData));
}
break;
case REG_POS_TWO:
{
typename OpTraits<DataType>::FloatT pos_two = 2.0;
std::memcpy(srfData.data(), &pos_two, sizeof(srfData));
}
break;
case REG_NEG_TWO:
{
typename OpTraits<DataType>::FloatT neg_two = -2.0;
std::memcpy(srfData.data(), &neg_two, sizeof(srfData));
}
break;
case REG_POS_FOUR:
{
typename OpTraits<DataType>::FloatT pos_four = 4.0;
std::memcpy(srfData.data(), &pos_four, sizeof(srfData));
}
break;
case REG_NEG_FOUR:
{
typename OpTraits<DataType>::FloatT neg_four = -4.0;
std::memcpy((void*)srfData.data(), (void*)&neg_four,
sizeof(srfData));
}
break;
case REG_PI:
{
assert(sizeof(DataType) == sizeof(ScalarRegF64)
|| sizeof(DataType) == sizeof(ScalarRegF32));
const ScalarRegU32 pi_u32(0x3e22f983UL);
const ScalarRegU64 pi_u64(0x3fc45f306dc9c882ULL);
if (sizeof(DataType) == sizeof(ScalarRegF64)) {
std::memcpy((void*)srfData.data(),
(void*)&pi_u64, sizeof(srfData));
} else {
std::memcpy((void*)srfData.data(),
(void*)&pi_u32, sizeof(srfData));
}
}
break;
default:
{
assert(sizeof(DataType) <= sizeof(srfData));
DataType misc_val(0);
if (isConstVal(_opIdx)) {
misc_val = (DataType)_gpuDynInst
->readConstVal<DataType>(_opIdx);
} else {
misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx);
}
std::memcpy((void*)srfData.data(), (void*)&misc_val,
sizeof(DataType));
}
}
}
/**
* for scalars we need to do some extra work to figure out how to
* map the op selector to the sgpr idx because some op selectors
* do not map directly to the srf (i.e., vcc/flat_scratch).
*/
int
regIdx(int dword) const
{
Wavefront *wf = _gpuDynInst->wavefront();
ComputeUnit *cu = _gpuDynInst->computeUnit();
int sgprIdx(-1);
if (_opIdx == REG_VCC_LO) {
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_HI) {
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_LO) {
assert(NumDwords == 1);
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
} else {
sgprIdx = cu->registerManager.mapSgpr(wf, _opIdx + dword);
}
assert(sgprIdx > -1);
return sgprIdx;
}
/**
* in VEGA each register is represented as a 32b unsigned value,
* but operands may require up to 16 registers, so we store
* all the individual 32b components here. for sub-dword operands
* we still consider them to be 1 dword because the minimum size
* of a register is 1 dword. this class will take care to do the
* proper packing/unpacking of sub-dword operands.
*/
std::array<ScalarRegU32, NumDwords> srfData;
};
// typedefs for the various sizes/types of scalar operands
using ScalarOperandU8 = ScalarOperand<ScalarRegU8, false, 1>;
using ScalarOperandI8 = ScalarOperand<ScalarRegI8, false, 1>;
using ScalarOperandU16 = ScalarOperand<ScalarRegU16, false, 1>;
using ScalarOperandI16 = ScalarOperand<ScalarRegI16, false, 1>;
using ScalarOperandU32 = ScalarOperand<ScalarRegU32, false>;
using ScalarOperandI32 = ScalarOperand<ScalarRegI32, false>;
using ScalarOperandF32 = ScalarOperand<ScalarRegF32, false>;
using ScalarOperandU64 = ScalarOperand<ScalarRegU64, false>;
using ScalarOperandI64 = ScalarOperand<ScalarRegI64, false>;
using ScalarOperandF64 = ScalarOperand<ScalarRegF64, false>;
using ScalarOperandU128 = ScalarOperand<ScalarRegU32, false, 4>;
using ScalarOperandU256 = ScalarOperand<ScalarRegU32, false, 8>;
using ScalarOperandU512 = ScalarOperand<ScalarRegU32, false, 16>;
// non-writeable versions of scalar operands
using ConstScalarOperandU8 = ScalarOperand<ScalarRegU8, true, 1>;
using ConstScalarOperandI8 = ScalarOperand<ScalarRegI8, true, 1>;
using ConstScalarOperandU16 = ScalarOperand<ScalarRegU16, true, 1>;
using ConstScalarOperandI16 = ScalarOperand<ScalarRegI16, true, 1>;
using ConstScalarOperandU32 = ScalarOperand<ScalarRegU32, true>;
using ConstScalarOperandI32 = ScalarOperand<ScalarRegI32, true>;
using ConstScalarOperandF32 = ScalarOperand<ScalarRegF32, true>;
using ConstScalarOperandU64 = ScalarOperand<ScalarRegU64, true>;
using ConstScalarOperandI64 = ScalarOperand<ScalarRegI64, true>;
using ConstScalarOperandF64 = ScalarOperand<ScalarRegF64, true>;
using ConstScalarOperandU128 = ScalarOperand<ScalarRegU32, true, 4>;
using ConstScalarOperandU256 = ScalarOperand<ScalarRegU32, true, 8>;
using ConstScalarOperandU512 = ScalarOperand<ScalarRegU32, true, 16>;
// typedefs for the various sizes/types of vector operands
using VecOperandU8 = VecOperand<VecElemU8, false, 1>;
using VecOperandI8 = VecOperand<VecElemI8, false, 1>;
using VecOperandU16 = VecOperand<VecElemU16, false, 1>;
using VecOperandI16 = VecOperand<VecElemI16, false, 1>;
using VecOperandU32 = VecOperand<VecElemU32, false>;
using VecOperandI32 = VecOperand<VecElemI32, false>;
using VecOperandF32 = VecOperand<VecElemF32, false>;
using VecOperandU64 = VecOperand<VecElemU64, false>;
using VecOperandF64 = VecOperand<VecElemF64, false>;
using VecOperandI64 = VecOperand<VecElemI64, false>;
using VecOperandU96 = VecOperand<VecElemU32, false, 3>;
using VecOperandU128 = VecOperand<VecElemU32, false, 4>;
using VecOperandU256 = VecOperand<VecElemU32, false, 8>;
using VecOperandU512 = VecOperand<VecElemU32, false, 16>;
// non-writeable versions of vector operands
using ConstVecOperandU8 = VecOperand<VecElemU8, true, 1>;
using ConstVecOperandI8 = VecOperand<VecElemI8, true, 1>;
using ConstVecOperandU16 = VecOperand<VecElemU16, true, 1>;
using ConstVecOperandI16 = VecOperand<VecElemI16, true, 1>;
using ConstVecOperandU32 = VecOperand<VecElemU32, true>;
using ConstVecOperandI32 = VecOperand<VecElemI32, true>;
using ConstVecOperandF32 = VecOperand<VecElemF32, true>;
using ConstVecOperandU64 = VecOperand<VecElemU64, true>;
using ConstVecOperandI64 = VecOperand<VecElemI64, true>;
using ConstVecOperandF64 = VecOperand<VecElemF64, true>;
using ConstVecOperandU96 = VecOperand<VecElemU32, true, 3>;
using ConstVecOperandU128 = VecOperand<VecElemU32, true, 4>;
using ConstVecOperandU256 = VecOperand<VecElemU32, true, 8>;
using ConstVecOperandU512 = VecOperand<VecElemU32, true, 16>;
} // namespace VegaISA
#endif // __ARCH_VEGA_OPERAND_HH__


@@ -0,0 +1,245 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/gpu_registers.hh"
namespace VegaISA
{
std::string
opSelectorToRegSym(int idx, int numRegs)
{
std::string reg_sym;
// we have an SGPR
if (idx <= REG_SGPR_MAX) {
if (numRegs > 1)
reg_sym = "s[" + std::to_string(idx) + ":" +
std::to_string(idx + numRegs - 1) + "]";
else
reg_sym = "s" + std::to_string(idx);
return reg_sym;
} else if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) {
if (numRegs > 1)
reg_sym = "v[" + std::to_string(idx - REG_VGPR_MIN) + ":" +
std::to_string(idx - REG_VGPR_MIN + numRegs - 1) + "]";
else
reg_sym = "v" + std::to_string(idx - REG_VGPR_MIN);
return reg_sym;
} else if (idx >= REG_INT_CONST_POS_MIN &&
idx <= REG_INT_CONST_POS_MAX) {
reg_sym = std::to_string(idx - REG_INT_CONST_POS_MIN + 1);
return reg_sym;
} else if (idx >= REG_INT_CONST_NEG_MIN &&
idx <= REG_INT_CONST_NEG_MAX) {
int inline_val = -1 - (idx - REG_INT_CONST_NEG_MIN);
reg_sym = std::to_string(inline_val);
return reg_sym;
}
switch (idx) {
case REG_FLAT_SCRATCH_LO:
reg_sym = "flat_scratch_lo";
break;
case REG_FLAT_SCRATCH_HI:
reg_sym = "flat_scratch_hi";
break;
case REG_VCC_LO:
reg_sym = "vcc";
break;
case REG_M0:
reg_sym = "m0";
break;
case REG_EXEC_LO:
reg_sym = "exec";
break;
case REG_ZERO:
reg_sym = "0";
break;
case REG_POS_HALF:
reg_sym = "0.5";
break;
case REG_NEG_HALF:
reg_sym = "-0.5";
break;
case REG_POS_ONE:
reg_sym = "1";
break;
case REG_NEG_ONE:
reg_sym = "-1";
break;
case REG_POS_TWO:
reg_sym = "2";
break;
case REG_NEG_TWO:
reg_sym = "-2";
break;
case REG_POS_FOUR:
reg_sym = "4";
break;
case REG_NEG_FOUR:
reg_sym = "-4";
break;
default:
fatal("VEGA ISA instruction has unknown register index %u\n", idx);
break;
}
return reg_sym;
}
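/**
 * Example symbols produced for disassembly (illustrative):
 *
 *     opSelectorToRegSym(3, 1)                -> "s3"
 *     opSelectorToRegSym(3, 2)                -> "s[3:4]"
 *     opSelectorToRegSym(REG_VGPR_MIN + 7, 4) -> "v[7:10]"
 *     opSelectorToRegSym(REG_VCC_LO, 2)       -> "vcc"
 */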
int
opSelectorToRegIdx(int idx, int numScalarRegs)
{
int regIdx = -1;
if (idx <= REG_SGPR_MAX) {
regIdx = idx;
} else if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) {
regIdx = idx - REG_VGPR_MIN;
} else if (idx == REG_VCC_LO) {
/**
* the VCC register occupies the two highest numbered
* SRF entries. VCC is typically indexed by specifying
* VCC_LO (simply called VCC) in the instruction encoding
* and reading it as a 64b value so we only return the
* index to the lower half of the VCC register.
*
* VCC_LO = s[NUM_SGPRS - 2]
* VCC_HI = s[NUM_SGPRS - 1]
*
*/
regIdx = numScalarRegs - 2;
} else if (idx == REG_VCC_HI) {
regIdx = numScalarRegs - 1;
} else if (idx == REG_FLAT_SCRATCH_LO) {
/**
* the FLAT_SCRATCH register occupies the two SRF entries
* just below VCC. FLAT_SCRATCH is typically indexed by
* specifying FLAT_SCRATCH_LO (simply called FLAT_SCRATCH)
* in the instruction encoding and reading it as a 64b value
* so we only return the index to the lower half of the
* FLAT_SCRATCH register.
*
* FLAT_SCRATCH_LO = s[NUM_SGPRS - 4]
* FLAT_SCRATCH_HI = s[NUM_SGPRS - 3]
*
*/
regIdx = numScalarRegs - 4;
} else if (idx == REG_FLAT_SCRATCH_HI) {
regIdx = numScalarRegs - 3;
} else if (idx == REG_EXEC_LO || idx == REG_EXEC_HI) {
/**
* If the operand is the EXEC mask we just return the op
* selector value indicating it is the EXEC mask, which is
* not part of any RF. Higher-level calls will understand
* that this resolves to a special system register, not an
* index into an RF.
*/
return idx;
}
return regIdx;
}
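/**
 * Worked example (illustrative; assumes 104 addressable scalar regs):
 *
 *     opSelectorToRegIdx(5, 104)                   ->   5  (s5)
 *     opSelectorToRegIdx(REG_VGPR_MIN + 9, 104)    ->   9  (v9)
 *     opSelectorToRegIdx(REG_VCC_LO, 104)          -> 102  (s[102])
 *     opSelectorToRegIdx(REG_FLAT_SCRATCH_LO, 104) -> 100  (s[100])
 */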
bool
isPosConstVal(int opIdx)
{
bool is_pos_const_val = (opIdx >= REG_INT_CONST_POS_MIN
&& opIdx <= REG_INT_CONST_POS_MAX);
return is_pos_const_val;
}
bool
isNegConstVal(int opIdx)
{
bool is_neg_const_val = (opIdx >= REG_INT_CONST_NEG_MIN
&& opIdx <= REG_INT_CONST_NEG_MAX);
return is_neg_const_val;
}
bool
isConstVal(int opIdx)
{
bool is_const_val = isPosConstVal(opIdx) || isNegConstVal(opIdx);
return is_const_val;
}
bool
isLiteral(int opIdx)
{
return opIdx == REG_SRC_LITERAL;
}
bool
isExecMask(int opIdx)
{
return opIdx == REG_EXEC_LO || opIdx == REG_EXEC_HI;
}
bool
isVccReg(int opIdx)
{
return opIdx == REG_VCC_LO || opIdx == REG_VCC_HI;
}
bool
isFlatScratchReg(int opIdx)
{
return opIdx == REG_FLAT_SCRATCH_LO || opIdx == REG_FLAT_SCRATCH_HI;
}
bool
isScalarReg(int opIdx)
{
// FLAT_SCRATCH and VCC are stored in an SGPR pair
if (opIdx <= REG_SGPR_MAX || opIdx == REG_FLAT_SCRATCH_LO ||
opIdx == REG_FLAT_SCRATCH_HI || opIdx == REG_VCC_LO ||
opIdx == REG_VCC_HI) {
return true;
}
return false;
}
bool
isVectorReg(int opIdx)
{
if (opIdx >= REG_VGPR_MIN && opIdx <= REG_VGPR_MAX)
return true;
return false;
}
} // namespace VegaISA