arch-vega: Reorganize inst and misc files (#789)

This PR reorganizes instructions.cc into multiple files and renames
some files that do not match their corresponding header file names. The
intention is to make iterating on development of these files faster.
This commit is contained in:
Matthew Poremba
2024-01-23 10:06:40 -08:00
committed by GitHub
26 changed files with 47291 additions and 46544 deletions

View File

@@ -49,13 +49,32 @@ Source('tlb_coalescer.cc')
DebugFlag('GPUPTWalker', 'Debug flag for GPU page table walker')
if env['CONF']['TARGET_GPU_ISA'] == 'vega':
Source('decoder.cc')
Source('gpu_decoder.cc')
Source('insts/gpu_static_inst.cc')
Source('insts/instructions.cc')
Source('insts/op_encodings.cc')
Source('isa.cc')
Source('registers.cc')
Source('gpu_isa.cc')
Source('gpu_registers.cc')
Source('insts/sop2.cc')
Source('insts/sopk.cc')
Source('insts/sop1.cc')
Source('insts/sopc.cc')
Source('insts/sopp.cc')
Source('insts/smem.cc')
Source('insts/vop2.cc')
Source('insts/vop1.cc')
Source('insts/vopc.cc')
Source('insts/vinterp.cc')
Source('insts/vop3.cc')
Source('insts/vop3_cmp.cc')
Source('insts/ds.cc')
Source('insts/mubuf.cc')
Source('insts/mtbuf.cc')
Source('insts/mimg.cc')
Source('insts/exp.cc')
Source('insts/flat.cc')
Source('insts/vop3p.cc')
Source('insts/vop3p_mai.cc')
DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA')

View File

@@ -29,9 +29,10 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/gpu_decoder.hh"
#include <vector>
#include "arch/amdgpu/vega/gpu_decoder.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "arch/amdgpu/vega/insts/vop3p.hh"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,58 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_EXP__EXP class methods ---
// EXP export instruction. Only construction/destruction are defined here;
// execute() is a placeholder that panics (no flags are set in the ctor).
Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt)
: Inst_EXP(iFmt, "exp")
{
} // Inst_EXP__EXP
Inst_EXP__EXP::~Inst_EXP__EXP()
{
} // ~Inst_EXP__EXP
// --- description from .arch file ---
// Export through SX.
void
Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst)
{
// Not implemented in the timing model yet; any program decoding this
// opcode will terminate the simulation with a panic.
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

View File

@@ -35,6 +35,7 @@
#include <cmath>
#include "arch/amdgpu/vega/gpu_registers.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
namespace gem5
{
@@ -315,7 +316,8 @@ namespace VegaISA
* 0x142: broadcast 15th thread of each row to next row
* 0x143: broadcast thread 31 to rows 2 and 3
*/
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
inline int
dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
int rowOffset, bool & outOfBounds)
{
// local variables

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,584 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// MTBUF (typed buffer) load/store instructions. Every opcode below is a
// placeholder: execute() panics via panicUnimplemented(), and the
// initiateAcc()/completeAcc() memory-pipeline hooks are empty stubs.
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_x")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
// --- description from .arch file ---
// Typed buffer load 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_xy")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
// --- description from .arch file ---
// Typed buffer load 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_xyz")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
// --- description from .arch file ---
// Typed buffer load 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
// --- description from .arch file ---
// Typed buffer load 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_X
::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_x")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_X
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X
// --- description from .arch file ---
// Typed buffer store 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_xy")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
// --- description from .arch file ---
// Typed buffer store 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_xyz")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
// --- description from .arch file ---
// Typed buffer store 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
// --- description from .arch file ---
// Typed buffer store 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::
~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
// --- description from .arch file ---
// Typed buffer load 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
// --- description from .arch file ---
// Typed buffer load 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(
InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
// --- description from .arch file ---
// Typed buffer load 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(
InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
// --- description from .arch file ---
// Typed buffer load 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
// --- description from .arch file ---
// Typed buffer store 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
// --- description from .arch file ---
// Typed buffer store 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
// --- description from .arch file ---
// Typed buffer store 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
// --- description from .arch file ---
// Typed buffer store 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute(
GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,599 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_SOPC__S_CMP_EQ_I32 class methods ---
Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_eq_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_EQ_I32
Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32()
{
} // ~Inst_SOPC__S_CMP_EQ_I32
// --- description from .arch file ---
// SCC = (S0.i == S1.i).
void
Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() == src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LG_I32 class methods ---
Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lg_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LG_I32
Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32()
{
} // ~Inst_SOPC__S_CMP_LG_I32
// --- description from .arch file ---
// SCC = (S0.i != S1.i).
void
Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() != src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GT_I32 class methods ---
Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_gt_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GT_I32
Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32()
{
} // ~Inst_SOPC__S_CMP_GT_I32
// --- description from .arch file ---
// SCC = (S0.i > S1.i).
void
Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() > src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GE_I32 class methods ---
Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_ge_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GE_I32
Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32()
{
} // ~Inst_SOPC__S_CMP_GE_I32
// --- description from .arch file ---
// SCC = (S0.i >= S1.i).
void
Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() >= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LT_I32 class methods ---
Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lt_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LT_I32
Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32()
{
} // ~Inst_SOPC__S_CMP_LT_I32
// --- description from .arch file ---
// SCC = (S0.i < S1.i).
void
Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() < src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LE_I32 class methods ---
Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_le_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LE_I32
Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32()
{
} // ~Inst_SOPC__S_CMP_LE_I32
// --- description from .arch file ---
// SCC = (S0.i <= S1.i).
void
Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() <= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_EQ_U32 class methods ---
Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_eq_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_EQ_U32
Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32()
{
} // ~Inst_SOPC__S_CMP_EQ_U32
// --- description from .arch file ---
// SCC = (S0.u == S1.u).
void
Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() == src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LG_U32 class methods ---
Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lg_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LG_U32
Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32()
{
} // ~Inst_SOPC__S_CMP_LG_U32
// --- description from .arch file ---
// SCC = (S0.u != S1.u).
void
Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() != src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GT_U32 class methods ---
Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_gt_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GT_U32
Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32()
{
} // ~Inst_SOPC__S_CMP_GT_U32
// --- description from .arch file ---
// SCC = (S0.u > S1.u).
void
Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() > src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GE_U32 class methods ---
Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_ge_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GE_U32
Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32()
{
} // ~Inst_SOPC__S_CMP_GE_U32
// --- description from .arch file ---
// SCC = (S0.u >= S1.u).
void
Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() >= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LT_U32 class methods ---
Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lt_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LT_U32
Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32()
{
} // ~Inst_SOPC__S_CMP_LT_U32
// --- description from .arch file ---
// SCC = (S0.u < S1.u).
void
Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() < src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LE_U32 class methods ---
Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_le_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LE_U32
Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32()
{
} // ~Inst_SOPC__S_CMP_LE_U32
// --- description from .arch file ---
// SCC = (S0.u <= S1.u).
void
Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() <= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_BITCMP0_B32 class methods ---
Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_bitcmp0_b32")
{
setFlag(ALU);
} // Inst_SOPC__S_BITCMP0_B32
Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32()
{
} // ~Inst_SOPC__S_BITCMP0_B32
// --- description from .arch file ---
// SCC = (S0.u[S1.u[4:0]] == 0).
void
Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_BITCMP1_B32 class methods ---
Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_bitcmp1_b32")
{
setFlag(ALU);
} // Inst_SOPC__S_BITCMP1_B32
Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32()
{
} // ~Inst_SOPC__S_BITCMP1_B32
// --- description from .arch file ---
// SCC = (S0.u[S1.u[4:0]] == 1).
void
Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_BITCMP0_B64 class methods ---
Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_bitcmp0_b64")
{
setFlag(ALU);
} // Inst_SOPC__S_BITCMP0_B64
Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64()
{
} // ~Inst_SOPC__S_BITCMP0_B64
// --- description from .arch file ---
// SCC = (S0.u64[S1.u[5:0]] == 0).
void
Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0;
scc.write();
} // execute
    // --- Inst_SOPC__S_BITCMP1_B64 class methods ---

    Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp1_b64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP1_B64

    Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64()
    {
    } // ~Inst_SOPC__S_BITCMP1_B64

    // --- description from .arch file ---
    // SCC = (S0.u64[S1.u[5:0]] == 1).
    void
    Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // S1[5:0] selects one of the 64 bits of S0; SCC is set iff that
        // bit is set.
        scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPC__S_SETVSKIP class methods ---

    Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_setvskip")
    {
    } // Inst_SOPC__S_SETVSKIP

    Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP()
    {
    } // ~Inst_SOPC__S_SETVSKIP

    // --- description from .arch file ---
    // VSKIP = S0.u[S1.u[4:0]].
    // Enables and disables VSKIP mode.
    // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instructions are
    // issued.
    // If any vector operations are outstanding, S_WAITCNT must be issued
    // before executing.
    // This instruction requires one waitstate after executing (e.g. S_NOP 0).
    // Example:
    // s_waitcnt 0
    // s_setvskip 1, 0 // Enable vskip mode.
    // s_nop 1
    void
    Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst)
    {
        // VSKIP mode is not modeled; executing this instruction is a
        // simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods ---

    Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_set_gpr_idx_on")
    {
    } // Inst_SOPC__S_SET_GPR_IDX_ON

    Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON()
    {
    } // ~Inst_SOPC__S_SET_GPR_IDX_ON

    // --- description from .arch file ---
    // MODE.gpr_idx_en = 1;
    // M0[7:0] = S0.u[7:0];
    // M0[15:12] = SIMM4 (direct contents of S1 field);
    // // Remaining bits of M0 are unmodified.
    // Enable GPR indexing mode. Vector operations after this will perform
    // relative GPR addressing based on the contents of M0. The structure
    // SQ_M0_GPR_IDX_WORD may be used to decode M0.
    // The raw contents of the S1 field are read and used to set the enable
    // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and
    // S1[3] = VDST_REL.
    void
    Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst)
    {
        // GPR indexing mode is not modeled; executing this instruction is
        // a simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPC__S_CMP_EQ_U64 class methods ---

    Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_eq_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_EQ_U64

    Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64()
    {
    } // ~Inst_SOPC__S_CMP_EQ_U64

    // --- description from .arch file ---
    // SCC = (S0.i64 == S1.i64).
    void
    Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Signed operand types are fine here: equality is identical for
        // signed and unsigned 64-bit values.
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = (src0.rawData() == src1.rawData()) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPC__S_CMP_LG_U64 class methods ---

    Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lg_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_LG_U64

    Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64()
    {
    } // ~Inst_SOPC__S_CMP_LG_U64

    // --- description from .arch file ---
    // SCC = (S0.i64 != S1.i64).
    void
    Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Signed operand types are fine here: inequality is identical for
        // signed and unsigned 64-bit values.
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = (src0.rawData() != src1.rawData()) ? 1 : 0;
        scc.write();
    } // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,648 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "dev/amdgpu/hwreg_defines.hh"
#include "gpu-compute/shader.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_SOPK__S_MOVK_I32 class methods ---
Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_movk_i32")
{
setFlag(ALU);
} // Inst_SOPK__S_MOVK_I32
Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32()
{
} // ~Inst_SOPK__S_MOVK_I32
// --- description from .arch file ---
// D.i = signext(SIMM16) (sign extension).
void
Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
sdst = simm16;
sdst.write();
} // execute
    // --- Inst_SOPK__S_CMOVK_I32 class methods ---

    Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmovk_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMOVK_I32

    Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32()
    {
    } // ~Inst_SOPK__S_CMOVK_I32

    // --- description from .arch file ---
    // if (SCC) then D.i = signext(SIMM16);
    // else NOP.
    // Conditional move with sign extension.
    void
    Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        // SDST is only written back when SCC is set; otherwise the
        // destination is left untouched.
        if (scc.rawData()) {
            sdst = simm16;
            sdst.write();
        }
    } // execute
// --- Inst_SOPK__S_CMPK_EQ_I32 class methods ---
Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_cmpk_eq_i32")
{
setFlag(ALU);
} // Inst_SOPK__S_CMPK_EQ_I32
Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32()
{
} // ~Inst_SOPK__S_CMPK_EQ_I32
// --- description from .arch file ---
// SCC = (S0.i == signext(SIMM16)).
void
Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src.read();
scc = (src.rawData() == simm16) ? 1 : 0;
scc.write();
} // execute
    // --- Inst_SOPK__S_CMPK_LG_I32 class methods ---

    Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lg_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LG_I32

    Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LG_I32

    // --- description from .arch file ---
    // SCC = (S0.i != signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() != simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GT_I32 class methods ---

    Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_gt_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GT_I32

    Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32()
    {
    } // ~Inst_SOPK__S_CMPK_GT_I32

    // --- description from .arch file ---
    // SCC = (S0.i > signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() > simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GE_I32 class methods ---

    Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_ge_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GE_I32

    Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32()
    {
    } // ~Inst_SOPK__S_CMPK_GE_I32

    // --- description from .arch file ---
    // SCC = (S0.i >= signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() >= simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LT_I32 class methods ---

    Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lt_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LT_I32

    Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LT_I32

    // --- description from .arch file ---
    // SCC = (S0.i < signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() < simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LE_I32 class methods ---

    Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_I32

    Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_I32

    // --- description from .arch file ---
    // SCC = (S0.i <= signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() <= simm16) ? 1 : 0;
        scc.write();
    } // execute
// --- Inst_SOPK__S_CMPK_EQ_U32 class methods ---
Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_cmpk_eq_u32")
{
setFlag(ALU);
} // Inst_SOPK__S_CMPK_EQ_U32
Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32()
{
} // ~Inst_SOPK__S_CMPK_EQ_U32
// --- description from .arch file ---
// SCC = (S0.u == SIMM16).
void
Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src.read();
scc = (src.rawData() == simm16) ? 1 : 0;
scc.write();
} // execute
    // --- Inst_SOPK__S_CMPK_LG_U32 class methods ---

    Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lg_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LG_U32

    Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LG_U32

    // --- description from .arch file ---
    // SCC = (S0.u != SIMM16).
    void
    Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() != simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GT_U32 class methods ---

    Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_gt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GT_U32

    Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GT_U32

    // --- description from .arch file ---
    // SCC = (S0.u > SIMM16).
    void
    Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() > simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GE_U32 class methods ---

    Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_ge_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GE_U32

    Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GE_U32

    // --- description from .arch file ---
    // SCC = (S0.u >= SIMM16).
    void
    Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() >= simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LT_U32 class methods ---

    Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LT_U32

    Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LT_U32

    // --- description from .arch file ---
    // SCC = (S0.u < SIMM16).
    void
    Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() < simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LE_U32 class methods ---

    Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_U32

    Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_U32

    // --- description from .arch file ---
    // SCC = (S0.u <= SIMM16).
    void
    Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() <= simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_ADDK_I32 class methods ---

    Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_addk_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_ADDK_I32

    Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32()
    {
    } // ~Inst_SOPK__S_ADDK_I32

    // --- description from .arch file ---
    // D.i = D.i + signext(SIMM16);
    // SCC = overflow.
    void
    Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        // SDST is both source and destination: D = D + signext(SIMM16).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16);
        // Signed-overflow detection: operands had the same sign but the
        // result's sign differs. Depends on sdst having been assigned above.
        scc = (bits(src.rawData(), 31) == bits(simm16, 15)
            && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
// --- Inst_SOPK__S_MULK_I32 class methods ---
Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_mulk_i32")
{
setFlag(ALU);
} // Inst_SOPK__S_MULK_I32
Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32()
{
} // ~Inst_SOPK__S_MULK_I32
// --- description from .arch file ---
// D.i = D.i * signext(SIMM16).
void
Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI16 simm16 = instData.SIMM16;
ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
src.read();
sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16);
sdst.write();
} // execute
    // --- Inst_SOPK__S_CBRANCH_I_FORK class methods ---

    Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cbranch_i_fork")
    {
        setFlag(Branch);
    } // Inst_SOPK__S_CBRANCH_I_FORK

    Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK()
    {
    } // ~Inst_SOPK__S_CBRANCH_I_FORK

    // --- description from .arch file ---
    // mask_pass = S0.u64 & EXEC;
    // mask_fail = ~S0.u64 & EXEC;
    // target_addr = PC + signext(SIMM16 * 4) + 4;
    // if (mask_pass == EXEC)
    //   PC = target_addr;
    // elsif (mask_fail == EXEC)
    //   PC += 4;
    // elsif (bitcount(mask_fail) < bitcount(mask_pass))
    //   EXEC = mask_fail;
    //   SGPR[CSP*4] = { target_addr, mask_pass };
    //   CSP++;
    //   PC += 4;
    // else
    //   EXEC = mask_pass;
    //   SGPR[CSP*4] = { PC + 4, mask_fail };
    //   CSP++;
    //   PC = target_addr;
    // end.
    // Conditional branch using branch-stack.
    // S0 = compare mask(vcc or any sgpr), and
    // SIMM16 = signed DWORD branch offset relative to next instruction.
    // See also S_CBRANCH_JOIN.
    void
    Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst)
    {
        // Branch-stack fork/join is not modeled; executing this
        // instruction is a simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPK__S_GETREG_B32 class methods ---

    Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_getreg_b32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_GETREG_B32

    Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32()
    {
    } // ~Inst_SOPK__S_GETREG_B32

    // --- description from .arch file ---
    // D.u = hardware-reg. Read some or all of a hardware register into the
    // LSBs of D.
    // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
    // is 1..32.
    void
    Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Decode SIMM16; the size field is encoded as size - 1.
        ScalarRegI16 simm16 = instData.SIMM16;
        ScalarRegU32 hwregId = simm16 & 0x3f;
        ScalarRegU32 offset = (simm16 >> 6) & 31;
        ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;

        ScalarRegU32 hwreg =
            gpuDynInst->computeUnit()->shader->getHwReg(hwregId);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        // NOTE(review): this read is dead -- sdst is fully overwritten
        // below before being written back.
        sdst.read();

        // Store value from hardware to part of the SDST.
        // NOTE(review): 1U << size overshifts (UB) when size == 32 --
        // confirm encodings never request a full-width field here.
        ScalarRegU32 mask = (((1U << size) - 1U) << offset);
        sdst = (hwreg & mask) >> offset;
        sdst.write();
    } // execute
// --- Inst_SOPK__S_SETREG_B32 class methods ---
Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_setreg_b32")
{
setFlag(ALU);
} // Inst_SOPK__S_SETREG_B32
Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32()
{
} // ~Inst_SOPK__S_SETREG_B32
// --- description from .arch file ---
// hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware
// register.
// SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
// is 1..32.
void
Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI16 simm16 = instData.SIMM16;
ScalarRegU32 hwregId = simm16 & 0x3f;
ScalarRegU32 offset = (simm16 >> 6) & 31;
ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;
ScalarRegU32 hwreg =
gpuDynInst->computeUnit()->shader->getHwReg(hwregId);
ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
sdst.read();
// Store value from SDST to part of the hardware register.
ScalarRegU32 mask = (((1U << size) - 1U) << offset);
hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask));
gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg);
// set MODE register to control the behavior of single precision
// floating-point numbers: denormal mode or round mode
if (hwregId==1 && size==2
&& (offset==4 || offset==0)) {
warn_once("Be cautious that s_setreg_b32 has no real effect "
"on FP modes: %s\n", gpuDynInst->disassemble());
return;
}
// panic if not changing MODE of floating-point numbers
panicUnimplemented();
} // execute
// --- Inst_SOPK__S_SETREG_IMM32_B32 class methods ---
Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32(
InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_setreg_imm32_b32")
{
setFlag(ALU);
} // Inst_SOPK__S_SETREG_IMM32_B32
Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32()
{
} // ~Inst_SOPK__S_SETREG_IMM32_B32
// --- description from .arch file ---
// Write some or all of the LSBs of IMM32 into a hardware register; this
// --- instruction requires a 32-bit literal constant.
// SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
// is 1..32.
void
Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI16 simm16 = instData.SIMM16;
ScalarRegU32 hwregId = simm16 & 0x3f;
ScalarRegU32 offset = (simm16 >> 6) & 31;
ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;
ScalarRegU32 hwreg =
gpuDynInst->computeUnit()->shader->getHwReg(hwregId);
ScalarRegI32 simm32 = extData.imm_u32;
// Store value from SIMM32 to part of the hardware register.
ScalarRegU32 mask = (((1U << size) - 1U) << offset);
hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask));
gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg);
// set MODE register to control the behavior of single precision
// floating-point numbers: denormal mode or round mode
if (hwregId==HW_REG_MODE && size==2
&& (offset==4 || offset==0)) {
warn_once("Be cautious that s_setreg_imm32_b32 has no real effect "
"on FP modes: %s\n", gpuDynInst->disassemble());
return;
}
// panic if not changing modes of single-precision FPs
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,900 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "debug/GPUSync.hh"
#include "gpu-compute/shader.hh"
namespace gem5
{
namespace VegaISA
{
    // --- Inst_SOPP__S_NOP class methods ---

    Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_nop")
    {
        setFlag(Nop);
    } // Inst_SOPP__S_NOP

    Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP()
    {
    } // ~Inst_SOPP__S_NOP

    // --- description from .arch file ---
    // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time,
    // 7 = 8 times.
    // This instruction may be used to introduce wait states to resolve
    // hazards; see the shader programming guide for details. Compare with
    // S_SLEEP.
    void
    Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
        // The repeat count in SIMM16[2:0] only inserts wait states in
        // hardware; the timing model does not need them, so this is empty.
    } // execute
    // --- Inst_SOPP__S_ENDPGM class methods ---

    Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_endpgm")
    {
        setFlag(EndOfKernel);
    } // Inst_SOPP__S_ENDPGM

    Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM()
    {
    } // ~Inst_SOPP__S_ENDPGM

    // --- description from .arch file ---
    // End of program; terminate wavefront.
    // The hardware implicitly executes S_WAITCNT 0 before executing this
    // --- instruction.
    // See S_ENDPGM_SAVED for the context-switch version of this instruction.
    //
    // Terminates the wavefront: flushes its fetch state, releases its
    // barrier and registers, and -- when it is the last wavefront of the
    // last workgroup of the kernel -- injects a memory fence (GL2 WB)
    // before the workgroup is retired.
    void
    Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // delete extra instructions fetched for completed work-items
        wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
            wf->instructionBuffer.end());
        if (wf->pendingFetch) {
            wf->dropFetch = true;
        }

        wf->computeUnit->fetchStage.fetchUnit(wf->simdId)
            .flushBuf(wf->wfSlotId);
        wf->setStatus(Wavefront::S_STOPPED);

        int refCount = wf->computeUnit->getLds()
            .decreaseRefCounter(wf->dispatchId, wf->wgId);

        /**
         * The parent WF of this instruction is exiting, therefore
         * it should not participate in this barrier any longer. This
         * prevents possible deadlock issues if WFs exit early.
         */
        int bar_id = WFBarrier::InvalidID;
        if (wf->hasBarrier()) {
            assert(wf->getStatus() != Wavefront::S_BARRIER);
            bar_id = wf->barrierId();
            assert(bar_id != WFBarrier::InvalidID);
            wf->releaseBarrier();
            cu->decMaxBarrierCnt(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
                    "program and decrementing max barrier count for "
                    "barrier Id%d. New max count: %d.\n", cu->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
                    cu->maxBarrierCnt(bar_id));
        }

        // NOTE(review): DPRINTF(GPUExec, ...) relies on debug/GPUExec.hh;
        // only debug/GPUSync.hh is included directly at the top of this
        // file -- confirm it is pulled in transitively.
        DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
            wf->computeUnit->cu_id, wf->wgId, refCount);

        wf->computeUnit->registerManager->freeRegisters(wf);
        wf->computeUnit->stats.completedWfs++;
        wf->computeUnit->activeWaves--;

        panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
            "than zero\n", wf->computeUnit->cu_id);

        DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
            wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId);

        // Sample per-register read-after-write statistics gathered during
        // the wave's lifetime, then reset the bookkeeping.
        for (int i = 0; i < wf->vecReads.size(); i++) {
            if (wf->rawDist.find(i) != wf->rawDist.end()) {
                wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
            }
        }
        wf->vecReads.clear();
        wf->rawDist.clear();
        wf->lastInstExec = 0;

        if (!refCount) {
            /**
             * If all WFs have finished, and hence the WG has finished,
             * then we can free up the barrier belonging to the parent
             * WG, but only if we actually used a barrier (i.e., more
             * than one WF in the WG).
             */
            if (bar_id != WFBarrier::InvalidID) {
                DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
                        "now complete. Releasing barrier Id%d.\n", cu->cu_id,
                        wf->simdId, wf->wfSlotId, wf->wfDynId,
                        wf->barrierId());
                cu->releaseBarrier(bar_id);
            }

            /**
             * Last wavefront of the workgroup has executed return. If the
             * workgroup is not the final one in the kernel, then simply
             * retire it; however, if it is the final one, i.e., indicating
             * the kernel end, then release operation (i.e., GL2 WB) is
             * needed
             */

            //check whether the workgroup is indicating the kernel end, i.e.,
            //the last workgroup in the kernel
            bool kernelEnd =
                wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);
            bool relNeeded =
                wf->computeUnit->shader->impl_kern_end_rel;

            //if it is not a kernel end, then retire the workgroup directly
            if (!kernelEnd || !relNeeded) {
                wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
                // NOTE(review): S_STOPPED was already set above; this
                // second setStatus appears redundant.
                wf->setStatus(Wavefront::S_STOPPED);
                wf->computeUnit->stats.completedWGs++;

                return;
            }

            /**
             * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and
             * retire the workgroup after receiving response.
             * note that GL0V and GL1 are read only, and they just forward GL2
             * WB request. When forwarding, GL1 send the request to all GL2 in
             * the complex
             */
            setFlag(MemSync);
            setFlag(GlobalSegment);
            // Notify Memory System of Kernel Completion
            // Kernel End = isKernel + isMemSync
            wf->setStatus(Wavefront::S_RETURNING);
            gpuDynInst->simdId = wf->simdId;
            gpuDynInst->wfSlotId = wf->wfSlotId;
            gpuDynInst->wfDynId = wf->wfDynId;

            DPRINTF(GPUExec, "inject global memory fence for CU%d: "
                    "WF[%d][%d][%d]\n", wf->computeUnit->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId);

            // call shader to prepare the flush operations
            wf->computeUnit->shader->prepareFlush(gpuDynInst);

            wf->computeUnit->stats.completedWGs++;
        } else {
            wf->computeUnit->shader->dispatcher().scheduleDispatch();
        }
    } // execute
// --- Inst_SOPP__S_BRANCH class methods ---
Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_branch")
{
setFlag(Branch);
} // Inst_SOPP__S_BRANCH
Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH()
{
} // ~Inst_SOPP__S_BRANCH
// --- description from .arch file ---
// PC = PC + signext(SIMM16 * 4) + 4 (short jump).
// For a long jump, use S_SETPC.
void
Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
Addr pc = gpuDynInst->pc();
ScalarRegI16 simm16 = instData.SIMM16;
pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
wf->pc(pc);
} // execute
    // --- Inst_SOPP__S_WAKEUP class methods ---

    Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_wakeup")
    {
    } // Inst_SOPP__S_WAKEUP

    Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP()
    {
    } // ~Inst_SOPP__S_WAKEUP

    // --- description from .arch file ---
    // Allow a wave to 'ping' all the other waves in its threadgroup to force
    // them to wake up immediately from an S_SLEEP instruction. The ping is
    // ignored if the waves are not sleeping.
    // This allows for more efficient polling on a memory location. The waves
    // which are polling can sit in a long S_SLEEP between memory reads, but
    // the wave which writes the value can tell them all to wake up early now
    // that the data is available. This is useful for fBarrier implementations
    // (speedup).
    // This method is also safe from races because if any wave misses the ping,
    // everything still works fine (whoever missed it just completes their
    // normal S_SLEEP).
    void
    Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst)
    {
        // Wave sleep/wakeup is not modeled; executing this instruction is
        // a simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_SCC0 class methods ---

    Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc0")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC0

    Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC0

    // --- description from .arch file ---
    // if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        if (!scc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        // pc is written back unconditionally; when the branch is not
        // taken the value is unchanged.
        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_SCC1 class methods ---

    Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc1")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC1

    Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC1

    // --- description from .arch file ---
    // if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        if (scc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        // pc is written back unconditionally; when the branch is not
        // taken the value is unchanged.
        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_VCCZ class methods ---

    Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCZ

    Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCZ

    // --- description from .arch file ---
    // if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // Reads the full 64-bit VCC pair starting at VCC_LO; the branch is
        // taken only when every lane's bit is clear.
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;

        vcc.read();

        if (!vcc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods ---

    Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccnz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCNZ

    Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCNZ

    // --- description from .arch file ---
    // if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // Reads the full 64-bit VCC pair starting at VCC_LO; the branch is
        // taken when any lane's bit is set.
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        vcc.read();

        if (vcc.rawData()) {
            Addr pc = gpuDynInst->pc();
            ScalarRegI16 simm16 = instData.SIMM16;
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
            wf->pc(pc);
        }
    } // execute
// --- Inst_SOPP__S_CBRANCH_EXECZ class methods ---
Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_execz")
{
setFlag(Branch);
setFlag(ReadsEXEC);
} // Inst_SOPP__S_CBRANCH_EXECZ
Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ()
{
} // ~Inst_SOPP__S_CBRANCH_EXECZ
// --- description from .arch file ---
// if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
Addr pc = gpuDynInst->pc();
ScalarRegI16 simm16 = instData.SIMM16;
pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
wf->pc(pc);
}
} // execute
// --- Inst_SOPP__S_CBRANCH_EXECNZ class methods ---
Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_execnz")
{
setFlag(Branch);
setFlag(ReadsEXEC);
} // Inst_SOPP__S_CBRANCH_EXECNZ
Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ()
{
} // ~Inst_SOPP__S_CBRANCH_EXECNZ
// --- description from .arch file ---
// if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().any()) {
Addr pc = gpuDynInst->pc();
ScalarRegI16 simm16 = instData.SIMM16;
pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
wf->pc(pc);
}
} // execute
    // --- Inst_SOPP__S_BARRIER class methods ---
    Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_barrier")
    {
        setFlag(MemBarrier);
    } // Inst_SOPP__S_BARRIER
    Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER()
    {
    } // ~Inst_SOPP__S_BARRIER
    // --- description from .arch file ---
    // Synchronize waves within a threadgroup.
    // If not all waves of the threadgroup have been created yet, waits for
    // entire group before proceeding.
    // If some waves in the threadgroup have already terminated, this waits on
    // only the surviving waves.
    // Barriers are legal inside trap handlers.
    void
    Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();
        // A wave with no barrier allocated treats s_barrier as a no-op
        // (presumably the single-wave-workgroup case -- verify against the
        // dispatcher's barrier allocation logic).
        if (wf->hasBarrier()) {
            int bar_id = wf->barrierId();
            // NOTE(review): the wave is expected to already be in S_BARRIER
            // state here, presumably set by an earlier pipeline stage --
            // confirm where that transition happens.
            assert(wf->getStatus() == Wavefront::S_BARRIER);
            // Register this wave's arrival with the CU's barrier bookkeeping
            // before logging, so the counts printed below include it.
            cu->incNumAtBarrier(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at "
                    "barrier Id%d. %d waves now at barrier, %d waves "
                    "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId,
                    wf->wfDynId, bar_id, cu->numAtBarrier(bar_id),
                    cu->numYetToReachBarrier(bar_id));
        }
    } // execute
// --- Inst_SOPP__S_SETKILL class methods ---
Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_setkill")
{
} // Inst_SOPP__S_SETKILL
Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL()
{
} // ~Inst_SOPP__S_SETKILL
// --- description from .arch file ---
// set KILL bit to value of SIMM16[0].
// Used primarily for debugging kill wave host command behavior.
void
Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_WAITCNT class methods ---
Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_waitcnt")
{
setFlag(ALU);
setFlag(Waitcnt);
} // Inst_SOPP__S_WAITCNT
Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT()
{
} // ~Inst_SOPP__S_WAITCNT
// --- description from .arch file ---
// Wait for the counts of outstanding lds, vector-memory and
// --- export/vmem-write-data to be at or below the specified levels.
// SIMM16[3:0] = vmcount (vector memory operations),
// SIMM16[6:4] = export/mem-write-data count,
// SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count).
void
Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 vm_cnt = 0;
ScalarRegI32 exp_cnt = 0;
ScalarRegI32 lgkm_cnt = 0;
vm_cnt = bits<ScalarRegI16>(instData.SIMM16, 3, 0);
exp_cnt = bits<ScalarRegI16>(instData.SIMM16, 6, 4);
lgkm_cnt = bits<ScalarRegI16>(instData.SIMM16, 12, 8);
gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT);
gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt);
} // execute
// --- Inst_SOPP__S_SETHALT class methods ---
Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sethalt")
{
} // Inst_SOPP__S_SETHALT
Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT()
{
} // ~Inst_SOPP__S_SETHALT
// --- description from .arch file ---
// Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume.
// The halt flag is ignored while PRIV == 1 (inside trap handlers) but the
// shader will halt immediately after the handler returns if HALT is still
// set at that time.
void
Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SLEEP class methods ---
Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sleep")
{
setFlag(ALU);
setFlag(Sleep);
} // Inst_SOPP__S_SLEEP
Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP()
{
} // ~Inst_SOPP__S_SLEEP
// --- description from .arch file ---
// Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks.
// The exact amount of delay is approximate. Compare with S_NOP.
void
Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16;
gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP);
// sleep duration is specified in multiples of 64 cycles
gpuDynInst->wavefront()->setSleepTime(64 * simm16);
} // execute
// --- Inst_SOPP__S_SETPRIO class methods ---
Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_setprio")
{
setFlag(ALU);
} // Inst_SOPP__S_SETPRIO
Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO()
{
} // ~Inst_SOPP__S_SETPRIO
// --- description from .arch file ---
// User settable wave priority is set to SIMM16[1:0]. 0 = lowest,
// 3 = highest.
// The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0],
// WaveAge[3:0]}.
void
Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegU16 simm16 = instData.SIMM16;
ScalarRegU32 userPrio = simm16 & 0x3;
warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio);
} // execute
// --- Inst_SOPP__S_SENDMSG class methods ---
Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sendmsg")
{
} // Inst_SOPP__S_SENDMSG
Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG()
{
} // ~Inst_SOPP__S_SENDMSG
// --- description from .arch file ---
// Send a message upstream to VGT or the interrupt handler.
// SIMM16[9:0] contains the message type and is documented in the shader
// --- programming guide.
void
Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SENDMSGHALT class methods ---
Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sendmsghalt")
{
} // Inst_SOPP__S_SENDMSGHALT
Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT()
{
} // ~Inst_SOPP__S_SENDMSGHALT
// --- description from .arch file ---
// Send a message and then HALT the wavefront; see S_SENDMSG for details.
void
Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_TRAP class methods ---
Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_trap")
{
} // Inst_SOPP__S_TRAP
Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP()
{
} // ~Inst_SOPP__S_TRAP
// --- description from .arch file ---
// TrapID = SIMM16[7:0];
// Wait for all instructions to complete;
// set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0],
// PC[47:0]};
// PC = TBA (trap base address);
// PRIV = 1.
// Enter the trap handler. This instruction may be generated internally as
// well in response to a host trap (HT = 1) or an exception.
// TrapID 0 is reserved for hardware use and should not be used in a
// shader-generated trap.
void
Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_ICACHE_INV class methods ---
Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_icache_inv")
{
} // Inst_SOPP__S_ICACHE_INV
Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV()
{
} // ~Inst_SOPP__S_ICACHE_INV
// --- description from .arch file ---
// Invalidate entire L1 instruction cache.
// You must have 12 separate S_NOP instructions or a jump/branch
// instruction after this instruction
// to ensure the SQ instruction buffer is purged.
void
Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_INCPERFLEVEL class methods ---
Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_incperflevel")
{
} // Inst_SOPP__S_INCPERFLEVEL
Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL()
{
} // ~Inst_SOPP__S_INCPERFLEVEL
// --- description from .arch file ---
// Increment performance counter specified in SIMM16[3:0] by 1.
void
Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_DECPERFLEVEL class methods ---
Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_decperflevel")
{
} // Inst_SOPP__S_DECPERFLEVEL
Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL()
{
} // ~Inst_SOPP__S_DECPERFLEVEL
// --- description from .arch file ---
// Decrement performance counter specified in SIMM16[3:0] by 1.
void
Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_TTRACEDATA class methods ---
Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_ttracedata")
{
} // Inst_SOPP__S_TTRACEDATA
Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA()
{
} // ~Inst_SOPP__S_TTRACEDATA
// --- description from .arch file ---
// Send M0 as user data to the thread trace stream.
void
Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods ---
Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbgsys")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGSYS
Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGSYS
// --- description from .arch file ---
// if (conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4)
// + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods ---
Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbguser")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGUSER
Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGUSER
// --- description from .arch file ---
// if (conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods ---
Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER
Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::
~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER
// --- description from .arch file ---
// if (conditional_debug_system || conditional_debug_user) then PC = PC +
// --- signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods ---
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER
// --- description from .arch file ---
// if (conditional_debug_system && conditional_debug_user) then PC = PC +
// --- signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_ENDPGM_SAVED class methods ---
Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_endpgm_saved")
{
} // Inst_SOPP__S_ENDPGM_SAVED
Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED()
{
} // ~Inst_SOPP__S_ENDPGM_SAVED
// --- description from .arch file ---
// End of program; signal that a wave has been saved by the context-switch
// trap handler and terminate wavefront.
// The hardware implicitly executes S_WAITCNT 0 before executing this
// instruction.
// Use S_ENDPGM in all cases unless you are executing the context-switch
// save handler.
void
Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods ---
Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_set_gpr_idx_off")
{
} // Inst_SOPP__S_SET_GPR_IDX_OFF
Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF()
{
} // ~Inst_SOPP__S_SET_GPR_IDX_OFF
// --- description from .arch file ---
// MODE.gpr_idx_en = 0.
// Clear GPR indexing mode. Vector operations after this will not perform
// --- relative GPR addressing regardless of the contents of M0. This
// --- instruction does not modify M0.
void
Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods ---
Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_set_gpr_idx_mode")
{
} // Inst_SOPP__S_SET_GPR_IDX_MODE
Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE()
{
} // ~Inst_SOPP__S_SET_GPR_IDX_MODE
// --- description from .arch file ---
// M0[15:12] = SIMM4.
// Modify the mode used for vector GPR indexing.
// The raw contents of the source field are read and used to set the enable
// bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL
// and SIMM4[3] = VDST_REL.
void
Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,115 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_VINTRP__V_INTERP_P1_F32 class methods ---
Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32(
InFmt_VINTRP *iFmt)
: Inst_VINTRP(iFmt, "v_interp_p1_f32")
{
setFlag(ALU);
setFlag(F32);
} // Inst_VINTRP__V_INTERP_P1_F32
Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32()
{
} // ~Inst_VINTRP__V_INTERP_P1_F32
// --- description from .arch file ---
// D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
// V_MAD_F32 for SP).
// CAUTION: when in HALF_LDS mode, D must not be the same GPR as S;
// if D == S then data corruption will occur.
// NOTE: In textual representations the I/J VGPR is the first source and
// the attribute is the second source; however in the VOP3 encoding the
// attribute is stored in the src0 field and the VGPR is stored in the
// src1 field.
void
Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_VINTRP__V_INTERP_P2_F32 class methods ---
Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32(
InFmt_VINTRP *iFmt)
: Inst_VINTRP(iFmt, "v_interp_p2_f32")
{
setFlag(ALU);
setFlag(F32);
} // Inst_VINTRP__V_INTERP_P2_F32
Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32()
{
} // ~Inst_VINTRP__V_INTERP_P2_F32
// --- description from .arch file ---
// D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
// V_MAD_F32 for SP).
// NOTE: In textual representations the I/J VGPR is the first source and
// the attribute is the second source; however in the VOP3 encoding the
// attribute is stored in the src0 field and the VGPR is stored in the
// src1 field.
void
Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_VINTRP__V_INTERP_MOV_F32 class methods ---
Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32(
InFmt_VINTRP *iFmt)
: Inst_VINTRP(iFmt, "v_interp_mov_f32")
{
setFlag(ALU);
setFlag(F32);
} // Inst_VINTRP__V_INTERP_MOV_F32
Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32()
{
} // ~Inst_VINTRP__V_INTERP_MOV_F32
// --- description from .arch file ---
// D.f = {P10,P20,P0}[S.u]; parameter load.
void
Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -31,6 +31,7 @@
#include "arch/amdgpu/vega/insts/vop3p.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "arch/arm/insts/fplib.hh"
namespace gem5
@@ -631,5 +632,236 @@ void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst)
vdst.write();
}
// --- Inst_VOP3P__V_PK_FMA_F32 class methods ---
Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_fma_f32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_FMA_F32
Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32()
{
} // ~Inst_VOP3P__V_PK_FMA_F32
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] =
// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] .
void
Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
// values cannot use bitwise operations. Consider the U64 to imply
// untyped 64-bits of data.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
src2.readSrc();
int opsel = instData.OPSEL;
int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
: bits(src2[lane], 31, 0);
float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
*reinterpret_cast<float*>(&s1l),
*reinterpret_cast<float*>(&s2l));
uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
: bits(src2[lane], 31, 0);
float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
*reinterpret_cast<float*>(&s1h),
*reinterpret_cast<float*>(&s2h));
uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
}
}
vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---
Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_mul_f32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_MUL_F32
Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32()
{
} // ~Inst_VOP3P__V_PK_MUL_F32
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
// S1.f[31:0]
void
Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
// values cannot use bitwise operations. Consider the U64 to imply
// untyped 64-bits of data.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
int opsel = instData.OPSEL;
int opsel_hi = extData.OPSEL_HI;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword1 = *reinterpret_cast<float*>(&lower_dword)
* *reinterpret_cast<float*>(&upper_dword);
lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword2 = *reinterpret_cast<float*>(&lower_dword)
* *reinterpret_cast<float*>(&upper_dword);
uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
}
}
vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---
Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_add_f32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_ADD_F32
Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32()
{
} // ~Inst_VOP3P__V_PK_ADD_F32
// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
// S1.f[31:0]
void
Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
// values cannot use bitwise operations. Consider the U64 to imply
// untyped 64-bits of data.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
int opsel = instData.OPSEL;
int opsel_hi = extData.OPSEL_HI;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword1 = *reinterpret_cast<float*>(&lower_dword)
+ *reinterpret_cast<float*>(&upper_dword);
lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword2 = *reinterpret_cast<float*>(&lower_dword)
+ *reinterpret_cast<float*>(&upper_dword);
uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
}
}
vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_mov_b32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_MOV_B32
Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32()
{
} // ~Inst_VOP3P__V_PK_MOV_B32
// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0].
void
Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
// Only OPSEL[1:0] are used
// OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
int opsel = instData.OPSEL;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
// OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1
uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
vdst[lane] = upper_dword << 32 | lower_dword;
}
}
vdst.write();
} // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,257 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "arch/amdgpu/vega/insts/vop3p.hh"
namespace gem5
{
namespace VegaISA
{
    // --- Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 class methods ---
    Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::
        Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8(InFmt_VOP3P_MAI *iFmt)
        : Inst_VOP3P_MAI(iFmt, "v_mfma_i32_16x16x16i8")
    {
        setFlag(ALU);
    } // Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8
    Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::
        ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8()
    {
    } // ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8
    // D(16x16I32) = A(16x16I8) x B(16x16I8) + C(16x16I32), 1 Blocks, 8
    // pass, srcA/srcB 1 archVgpr, srcC/D 4 accVGPR
    // Functional (not cycle-accurate) matrix-fused-multiply-add: unpacks
    // A and B from one VGPR each, accumulates C from four VGPRs, computes
    // result = A*B + C with a plain triple loop, and scatters the result
    // back into four destination VGPRs.
    void
    Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst)
    {
        // ACC_CD would redirect C/D to the accumulation-register file;
        // currently ignored with a warning, so acc_offset stays 0.
        int acc_offset = 0;
        if (instData.ACC_CD) {
            warn("ACC_CD not yet implemented\n");
        }

        // int8 size allows for 4 elements per lane. At 16x16 this means 4
        // lanes per column (A matrix) / (B matrix). This whole matrix fits
        // in one VGPR. The C matrix with size int32 requires 4 VGPRs.

        // Handle the C matrix by using a delta. This is set to 1 normally to
        // move to the next VGPR (1 dword away) and 0 if the input is a scalar
        // reg (e.g., a constant).
        int delta = isVectorReg(extData.SRC2) ? 1 : 0;

        // VecOperandI8 will read 8 bits and sign extend, so used U32 to read
        // as "untyped" 32-bit values.
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset);
        ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
        ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
        ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
        VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset);
        VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1);
        VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2);
        VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3);

        src0.readSrc();
        src1.readSrc();
        src2a.readSrc();
        src2b.readSrc();
        src2c.readSrc();
        src2d.readSrc();

        // Unpack A: each lane carries four sign-extended int8 elements of
        // one row; lane groups of 16 advance the starting column by 4.
        int32_t A[16][16];
        for (int i = 0; i < 64; ++i) {
            // src0[0:15] contains columns 1 - 4 packed for rows 0 - 15,
            // src0[16:31] contains columns 5 - 8 packed for rows 0 - 15,
            // src0[32:47] contains columns 9 - 12 packed for rows 0 - 15,
            // src0[48:63] contains columns 13 - 16 packed for rows 0 - 15,
            int row = i % 16;
            int start_col = (i / 16) * 4;
            A[row][start_col+0] = sext<8>(bits(src0[i], 7, 0));
            A[row][start_col+1] = sext<8>(bits(src0[i], 15, 8));
            A[row][start_col+2] = sext<8>(bits(src0[i], 23, 16));
            A[row][start_col+3] = sext<8>(bits(src0[i], 31, 24));
        }

        // Unpack B: transposed layout relative to A -- each lane carries
        // four packed rows of one column.
        int32_t B[16][16];
        for (int i = 0; i < 64; ++i) {
            // src1[0:15] contains rows 1 - 4 packed for columns 0 - 15
            // src1[16:31] contains rows 5 - 8 packed for columns 0 - 15
            // src1[32:47] contains rows 9 - 12 packed for columns 0 - 15
            // src1[48:63] contains rows 13 - 16 packed for columns 0 - 15
            int start_row = (i / 16) * 4;
            int col = i % 16;
            B[start_row+0][col] = sext<8>(bits(src1[i], 7, 0));
            B[start_row+1][col] = sext<8>(bits(src1[i], 15, 8));
            B[start_row+2][col] = sext<8>(bits(src1[i], 23, 16));
            B[start_row+3][col] = sext<8>(bits(src1[i], 31, 24));
        }

        int32_t result[16][16];

        // Load accumulation matrix C into result
        for (int i = 0; i < 64; ++i) {
            // src2a contains rows 0, 4, 8, 12
            result[(i/16)*4][(i%16)] = src2a[i];
            // src2b contains rows 1, 5, 9, 13
            result[(i/16)*4+1][(i%16)] = src2b[i];
            // src2c contains rows 2, 6, 10, 14
            result[(i/16)*4+2][(i%16)] = src2c[i];
            // src2d contains rows 3, 7, 11, 15
            result[(i/16)*4+3][(i%16)] = src2d[i];
        }

        // Compute new result - This is (obviously) not optimized
        for (int i = 0; i < 16; ++i) {
            for (int j = 0; j < 16; ++j) {
                for (int k = 0; k < 16; ++k) {
                    result[i][j] += A[i][k] * B[k][j];
                }
            }
        }

        // Put result in dest VGPRs, mirroring the C-matrix row interleave.
        for (int i = 0; i < 64; ++i) {
            // vdsta contains rows 0, 4, 8, 12
            vdsta[i] = result[(i/16)*4][(i%16)];
            // vdstb contains rows 1, 5, 9, 13
            vdstb[i] = result[(i/16)*4+1][(i%16)];
            // vdstc contains rows 2, 6, 10, 14
            vdstc[i] = result[(i/16)*4+2][(i%16)];
            // vdstd contains rows 3, 7, 11, 15
            vdstd[i] = result[(i/16)*4+3][(i%16)];
        }

        vdsta.write();
        vdstb.write();
        vdstc.write();
        vdstd.write();
    } // execute
// --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods ---

// Decode-time constructor: records the disassembly mnemonic and tags
// the instruction as an ALU operation.
Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(
        InFmt_VOP3P_MAI *iFmt)
    : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64")
{
    setFlag(ALU);
} // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
// Trivial destructor; nothing to release.
Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
    ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64() = default;
// D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8
// pass, srcA/srcB 2 VGPR, srcC/D 8 VGPR
void
Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst)
{
    int acc_offset = 0;
    if (instData.ACC_CD) {
        warn("ACC_CD not yet implemented\n");
    }

    // src2 covers 8 dwords (four F64 register pairs), which is more
    // than the operand's operator[] overload can address at once, so
    // each pair is mapped through its own operand object. When SRC2
    // names a VGPR, consecutive pairs sit two dwords apart; when it is
    // a constant, every pair reads the same value, hence stride zero.
    const int pairStride = isVectorReg(extData.SRC2) ? 2 : 0;

    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2 + acc_offset);
    ConstVecOperandF64 src2b(gpuDynInst,
                             extData.SRC2 + acc_offset + 1 * pairStride);
    ConstVecOperandF64 src2c(gpuDynInst,
                             extData.SRC2 + acc_offset + 2 * pairStride);
    ConstVecOperandF64 src2d(gpuDynInst,
                             extData.SRC2 + acc_offset + 3 * pairStride);
    VecOperandF64 vdsta(gpuDynInst, instData.VDST + acc_offset);
    VecOperandF64 vdstb(gpuDynInst, instData.VDST + acc_offset + 2);
    VecOperandF64 vdstc(gpuDynInst, instData.VDST + acc_offset + 4);
    VecOperandF64 vdstd(gpuDynInst, instData.VDST + acc_offset + 6);

    src0.readSrc();
    src1.readSrc();
    src2a.readSrc();
    src2b.readSrc();
    src2c.readSrc();
    src2d.readSrc();

    // Seed the accumulator with C, which is laid out row major:
    // src2a holds rows 0-3, src2b rows 4-7, src2c rows 8-11, and
    // src2d rows 12-15; within each operand, lane = 16*row + col.
    double acc[16][16];
    for (int lane = 0; lane < 64; ++lane) {
        const int row = lane / 16;
        const int col = lane % 16;
        acc[row][col] = src2a[lane];
        acc[row + 4][col] = src2b[lane];
        acc[row + 8][col] = src2c[lane];
        acc[row + 12][col] = src2d[lane];
    }

    // Accumulate A x B. A (16x4) is column major: column k occupies
    // lanes [16k, 16k+15] of src0. B (4x16) is row major: row k
    // occupies lanes [16k, 16k+15] of src1. Iterating k outermost
    // still adds the k = 0..3 terms to each element in the same order
    // as a per-element dot product would.
    for (int k = 0; k < 4; ++k) {
        for (int row = 0; row < 16; ++row) {
            const double a_elem = src0[16 * k + row];
            for (int col = 0; col < 16; ++col) {
                acc[row][col] += a_elem * src1[16 * k + col];
            }
        }
    }

    // Scatter the 16x16 result across the destination register pairs,
    // mirroring the src2 layout: vdsta holds rows 0-3, vdstb rows 4-7,
    // vdstc rows 8-11, and vdstd rows 12-15.
    for (int lane = 0; lane < 64; ++lane) {
        const int row = lane / 16;
        const int col = lane % 16;
        vdsta[lane] = acc[row][col];
        vdstb[lane] = acc[row + 4][col];
        vdstc[lane] = acc[row + 8][col];
        vdstd[lane] = acc[row + 12][col];
    }

    vdsta.write();
    vdstb.write();
    vdstc.write();
    vdstd.write();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff